博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
使用selenium抓取华尔街见闻和新浪财经数据
阅读量:6915 次
发布时间:2019-06-27

本文共 3385 字,大约阅读时间需要 11 分钟。

# Collect live news from Wallstreetcn and Sina Finance with Selenium and
# store the items in a local MongoDB instance.
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Fixed User-Agent string. It is defined at module level but is not
# referenced by any function below.
ua_list = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'

_CHROMEDRIVER_PATH = r"/usr/local/share/chromedriver"
_SINA_LIVE_URL = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page='


def _new_driver():
    """Start a Chrome session using the chromedriver at the fixed local path."""
    # NOTE(review): passing the executable path positionally is the older
    # Selenium calling convention; confirm against the installed version.
    return webdriver.Chrome(_CHROMEDRIVER_PATH)


def _render_page(driver, url):
    """Open *url*, scroll down twice, and return the parsed page source.

    The page is scrolled by 3000 and then 5000 pixels with a 5 second pause
    after each scroll (presumably so additional items have time to load)
    before the source is handed to BeautifulSoup.
    """
    driver.get(url)
    for step in (3000, 5000):
        driver.execute_script('window.scrollBy(0,%d)' % step)
        time.sleep(5)
    return BeautifulSoup(driver.page_source, 'html.parser')


def _find_items(soup, container_class, item_class):
    """Return the item <div>s inside the container <div>, or [] if it is absent."""
    container = soup.find('div', class_=container_class)
    if container is None:
        return []
    return container.find_all('div', class_=item_class)


def _text(item, tag, css_class):
    """Return the text of the first matching child of *item*, or None."""
    node = item.find(tag, attrs={'class': css_class})
    return None if node is None else node.get_text()


def _replace_by_key(collection, key, document):
    """Delete stored documents sharing *document[key]*, then insert *document*."""
    # delete_many is a no-op when nothing matches, so no prior count is needed.
    collection.delete_many({key: document[key]})
    collection.insert_one(document)


def get_hej_news():
    """Scrape Wallstreetcn global live news and store it in MongoDB.

    Each item is stored in ``news.hej_news`` as ``{'new_time', 'news'}``;
    an existing document with the same ``new_time`` is replaced.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        hej_news = client['news']['hej_news']

        driver = _new_driver()
        try:
            soup = _render_page(driver, 'https://wallstreetcn.com/live/global')
        finally:
            # Release the browser even if loading or parsing fails.
            driver.quit()

        for item in _find_items(soup, 'livenews', 'live-item'):
            new_time = _text(item, 'span', 'live-item__time__text')
            body = _text(item, 'div', 'content-html')
            if new_time is None or body is None:
                # Skip items that lack the expected time/content nodes.
                continue
            # Drop newlines and "//" markers from the text.
            body = body.strip().replace('\n', '').replace('//', '')
            _replace_by_key(
                hej_news,
                'new_time',
                {'new_time': new_time, 'news': body},
            )
    finally:
        client.close()
    print('存储华尔街见闻宏观新闻成功')


def get_xlcj_news():
    """Scrape pages 1-6 of the Sina Finance live news board into MongoDB.

    Each item is stored in ``news.xlcj_news`` as
    ``{'news_time', 'news_type', 'news'}``; an existing document with the
    same ``news_time`` is replaced.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        xlcj_news = client['news']['xlcj_news']

        # One browser session is reused for every page.
        driver = _new_driver()
        try:
            for page in range(1, 7):
                soup = _render_page(driver, _SINA_LIVE_URL + str(page))
                for item in _find_items(soup, 'bd_list', 'bd_i_og'):
                    news_time = _text(item, 'p', 'bd_i_time_c')
                    news_type = _text(item, 'p', 'bd_i_tags')
                    body = _text(item, 'p', 'bd_i_txt_c')
                    if news_time is None or news_type is None or body is None:
                        # Skip items that lack any of the expected nodes.
                        continue
                    news_time = news_time.strip()
                    news_type = news_type.strip().replace("\n", "")
                    print(news_time,news_type,body)
                    _replace_by_key(
                        xlcj_news,
                        'news_time',
                        {
                            'news_time': news_time,
                            'news_type': news_type,
                            'news': body,
                        },
                    )
        finally:
            driver.quit()
    finally:
        client.close()
    print('新浪财经突发live板块新闻存储成功')


def main():
    """Run both scrapers in sequence."""
    get_hej_news()
    get_xlcj_news()


if __name__ == '__main__':
    main()

转载地址:http://csacl.baihongyu.com/

你可能感兴趣的文章
CentOS中service命令与/etc/init.d的关系以及centos7的变化
查看>>
java中读取txt文件获得编码格式方法
查看>>
pt(Percona Toolkit)工具详解:(二)工具介绍
查看>>
智慧人生 仁者见仁 与君共勉
查看>>
org.hibernate.MappingException: Could not get constructor for org.hibernate.pers
查看>>
Apache配置——域名301跳转
查看>>
KVM安装CentOS6.4系统虚拟机
查看>>
POI cell的宽度自适应
查看>>
检查ipa包是否包含手机的方法
查看>>
linux 定时器
查看>>
jquery实现input输入框实时输入触发事件
查看>>
多线程高容错爬头条街拍美图
查看>>
git 解决多个ssh提交到多个不同项目 multiple SSH Keys with different project
查看>>
HMAC
查看>>
apache报Permission denied: make_sock: could not bind to address 解决方案
查看>>
64bit 安装eclipse svn插件
查看>>
RBDDriver -1.1.0 driver is uninitialized
查看>>
道哥:我人生有两大选择,为的却都是同一件事
查看>>
Decision Trees 笔记
查看>>
Ajax初学(3)jQuery实现Ajax
查看>>