# Sina Finance data collection

The script below uses Selenium and BeautifulSoup to scrape live news items from Wallstreetcn and the Sina Finance breaking-news ("globalnews") live section, and stores them in a local MongoDB instance.

```python
import requests
import pymongo
import time
from selenium import webdriver
from bs4 import BeautifulSoup
# from fake_useragent import UserAgent
# ua_list = UserAgent()
ua_list = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'


def get_hej_news():
    """Scrape macro news from the Wallstreetcn live feed."""
    client = pymongo.MongoClient('localhost', 27017)
    news = client['news']
    hej_news = news['hej_news']
    chromedriver = r"/usr/local/share/chromedriver"
    # Selenium 3-style constructor; Selenium 4+ expects the driver path wrapped in a Service object
    driver = webdriver.Chrome(chromedriver)
    # Open the target URL with get()
    driver.get('https://wallstreetcn.com/live/global')
    # Scroll the page down: window.scrollBy(0, scrollStep), where scrollStep is the scroll distance
    js = 'window.scrollBy(0,3000)'
    driver.execute_script(js)
    time.sleep(5)
    js = 'window.scrollBy(0,5000)'
    driver.execute_script(js)
    time.sleep(5)
    pages = driver.page_source
    soup = BeautifulSoup(pages, 'html.parser')
    soup1 = soup.find('div', class_='livenews')
    content = soup1.find_all('div', class_='live-item')
    for i in content:
        new_time = i.find('span', attrs={'class': 'live-item__time__text'}).get_text()
        news_text = (i.find('div', attrs={'class': 'content-html'})
                     .get_text().strip().replace('\n', '').replace('//', ''))
        # De-duplicate on the timestamp: drop any existing item with the same time, then insert
        isexit = hej_news.count_documents({'new_time': new_time})
        if isexit != 0:
            hej_news.delete_many({'new_time': new_time})
        data = {
            'new_time': new_time,
            'news': news_text
        }
        hej_news.insert_one(data)
    driver.close()
    driver.quit()
    print('Wallstreetcn macro news stored successfully')


def get_xlcj_news():
    """Scrape the Sina Finance breaking-news live section."""
    client = pymongo.MongoClient('localhost', 27017)
    news = client['news']
    xlcj_news = news['xlcj_news']
    num = 1
    while num < 7:
        chromedriver = r"/usr/local/share/chromedriver"
        driver = webdriver.Chrome(chromedriver)
        url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(num)
        # Open the target URL with get()
        driver.get(url)
        # Scroll the page down: window.scrollBy(0, scrollStep), where scrollStep is the scroll distance
        js = 'window.scrollBy(0,3000)'
        driver.execute_script(js)
        time.sleep(5)
        js = 'window.scrollBy(0,5000)'
        driver.execute_script(js)
        time.sleep(5)
        pages = driver.page_source
        soup = BeautifulSoup(pages, 'html.parser')
        soup1 = soup.find('div', class_='bd_list')
        content = soup1.find_all('div', class_='bd_i_og')
        num += 1
        for i in content:
            news_time = i.find('p', attrs={'class': 'bd_i_time_c'}).get_text().strip()
            news_type = i.find('p', attrs={'class': 'bd_i_tags'}).get_text().strip().replace("\n", "")
            news_text = i.find('p', attrs={'class': 'bd_i_txt_c'}).get_text()
            print(news_time, news_type, news_text)
            # De-duplicate on the timestamp: drop any existing item with the same time, then insert
            isexit = xlcj_news.count_documents({'news_time': news_time})
            if isexit != 0:
                xlcj_news.delete_many({'news_time': news_time})
            data = {
                'news_time': news_time,
                'news_type': news_type,
                'news': news_text
            }
            xlcj_news.insert_one(data)
        driver.close()
        driver.quit()
    print('Sina Finance breaking news stored successfully')


def main():
    # his_time = input('Enter the news time to query (format: 2017-11-2 00:00:00):')
    # history_time = str(time.mktime(time.strptime(his_time, '%Y-%m-%d %H:%M:%S'))).replace('.0', '')
    get_hej_news()
    get_xlcj_news()


if __name__ == '__main__':
    main()
```
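As a side note, the count-then-delete-then-insert sequence used above for de-duplication can be collapsed into a single upsert keyed on the timestamp. The sketch below is illustrative only: it assumes the same local MongoDB and collection names as the script, and `save_news` is a hypothetical helper that is not part of the original code.

```python
import pymongo

# Assumes the same local MongoDB instance and the 'news'/'xlcj_news' names used above
client = pymongo.MongoClient('localhost', 27017)
xlcj_news = client['news']['xlcj_news']


def save_news(news_time, news_type, news_text):
    """Insert a news item, or overwrite an existing item that has the same timestamp."""
    xlcj_news.replace_one(
        {'news_time': news_time},  # match on the timestamp used as the de-dup key
        {'news_time': news_time, 'news_type': news_type, 'news': news_text},
        upsert=True,               # insert if no document matches the filter
    )
```

With `replace_one(..., upsert=True)` each timestamp maps to exactly one stored document, so the separate count and delete round-trips are no longer needed.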