import requests
from lxml import etree
import re
def xiangqi(data, timeout=10):
    """Fetch the article page at ``data['link']`` and return its body text.

    The page is downloaded with ``requests``, its text encoding is chosen
    from the ``charset="..."`` attributes found in the markup, and the text
    nodes matched by the article XPath expressions are joined into a single
    string with newline characters removed.

    Args:
        data: Mapping with a ``'link'`` key holding the article URL.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. ``None`` disables the timeout.

    Returns:
        The concatenated paragraph text, or ``None`` when the page body is
        empty or neither XPath expression yields any text.

    Raises:
        KeyError: If ``data`` has no ``'link'`` entry.
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps one unresponsive server from stalling the whole run.
    response = requests.get(data['link'], timeout=timeout)

    # Scan the provisionally decoded markup for charset="..." attributes.
    declared = re.findall('charset="(.*?)"', response.text)

    # NOTE(review): fewer than two charset="gbk" attributes selects GBK and
    # two or more selects UTF-8. That reads as inverted, but it may be tuned
    # to the markup of the target site -- confirm before changing it.
    if declared.count('gbk') < 2:
        response.encoding = 'gbk'
    else:
        response.encoding = 'utf8'

    # Re-read the text now that the encoding has been overridden.
    page = response.text
    if not page.strip():
        # Nothing to parse; report "no content" instead of failing below.
        return None

    tree = etree.HTML(page)
    if tree is None:
        # Guard against the parser producing no root element.
        return None

    fragments = tree.xpath(
        '//p[@class="one-p"]//text()|//div[@class="content_area"]/p/text()'
    )
    content = ''.join(fragments).replace('\n', '')

    # An empty string means nothing matched; callers receive None for that.
    return content or None
# Listing endpoint; the JSON reply is read below as a 'data' list whose items
# each carry a 'title' and a 'url'.
url='https://i.news.qq.com/trpc.qqnews_web.pc_base_srv.base_http_proxy/NinjaPageContentSync?pull_urls=news_top_2018'

# Bound the wait and fail loudly on an HTTP error status, so an error page is
# not handed to the JSON decoder.
_response = requests.get(url, timeout=10)
_response.raise_for_status()
source = _response.json()

data = {}
for i in source['data']:
    # Build a fresh record per article rather than mutating one shared dict,
    # so a record kept by reference is never overwritten by a later article.
    data = {'title': i['title'], 'link': i['url']}
    # xiangqi() returns None when no article text could be extracted.
    content = xiangqi(data)
    data['content'] = content
    print(data)