# -*- coding: utf-8 -*-
import sys
import requests
import time
import random
import codecs

reload(sys)                      # Python 2 only: allow implicit utf-8 encoding of str objects
sys.setdefaultencoding('utf8')

# Pools of User-Agent strings and cookies; one of each is picked at random per run
agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"]

cookies = [
    "SINAGLOBAL=6061592354656.324.1489207743838; un=18240343109; TC-V5-G0=52dad2141fc02c292fc30606953e43ef; wb_cusLike_2140170130=N; _s_tentry=login.sina.com.cn; Apache=5393750164131.485.1511882292296; ULV=1511882292314:55:14:7:5393750164131.485.1511882292296:1511789163477; TC-Page-G0=1e758cd0025b6b0d876f76c087f85f2c; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=7cbd20d7f5c121ef83f50e3b28a77ed7; cross_origin_proto=SSL; WBStorage=82ca67f06fa80da0|undefined; UOR=,,login.sina.com.cn; WBtopGlobal_register_version=573631b425a602e8; crossidccode=CODE-tc-1EjHEO-2SNIe8-y00Hd0Yq79mGw3l1975ae; SSOLoginState=1511882345; SCF=AvFiX3-W7ubLmZwXrMhoZgCv_3ZXikK7fhjlPKRLjog0OIIQzSqq7xsdv-_GhEe8XWdkHikzsFJyqtvqej6OkaM.; SUB=_2A253GQ45DeThGeRP71IQ9y7NyDyIHXVUb3jxrDV8PUNbmtAKLWrSkW9NTjfYoWTfrO0PkXSICRzowbfjExbQidve; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFaVAdSwLmvOo1VRiSlRa3q5JpX5KzhUgL.FozpSh5pS05pe052dJLoIfMLxKBLBonL122LxKnLB.qL1-z_i--fiKyFi-2Xi--fi-2fiKyFTCH8SFHF1C-4eFH81FHWSE-RebH8SE-4BC-RSFH8SFHFBbHWeEH8SEHWeF-RegUDMJ7t; SUHB=04W-u1HCo6armH; ALF=1543418344; wvr=6"]


def readfromtxt(filename):
    # Read the whole text file (the list of Weibo ids) as one utf-8 string
    file = codecs.open(u'D:/pythondata/spider/web/' + filename, "r", 'utf-8')
    text = file.read()
    file.close()
    return text


def writeintxt(dict, filename):
    # Append one line per Weibo id: "<id>####<comment1>####<comment2>####..."
    output = codecs.open(u"D:/pythondata/spider/web/" + filename, 'a+', 'utf-8')
    for i, list in dict.items():
        comment_str = ""
        for l in list:
            comment_str = comment_str + l.__str__().replace('$$', '') + "####"
        output.write(i + "####" + comment_str + '\n')
    output.close()

user_agent = random.choice(agents)
cookies = random.choice(cookies)
# Request headers for the m.weibo.cn mobile comment API
headers = {
    'User-agent': user_agent,
    'Host': 'm.weibo.cn',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Referer': 'https://m.weibo.cn/u/**********',  # *** replace with a valid Sina Weibo user id
    'cookie': cookies,
    'Connection': 'keep-alive',
}

base_url = 'https://m.weibo.cn/api/comments/show?id='
weibo_id_list = readfromtxt('weibo_id.txt').split('\n')
result_dict = {}
for weibo_id in weibo_id_list:
    try:
        record_list = []
        i = 1
        SIGN = 1
        # Page through the comment API until it stops returning ok == 1
        while (SIGN):
            # url = base_url + weibo_id.split(',')[1] + '&page=' + str(i)
            url = base_url + str(weibo_id) + '&page=' + str(i)
            resp = requests.get(str(url), headers=headers, timeout=200)
            jsondata = resp.json()
            if jsondata.get('ok') == 1:
                SIGN = 1
                i = i + 1
                data = jsondata.get('data')
                for d in data.get('data'):
                    comment = d.get('text').replace('$$', '')
                    # like_count = d.get('like_counts')
                    # user_id = d.get("user").get('id')
                    # user_name = d.get("user").get('screen_name').replace('$$','')
                    # one_record = user_id.__str__()+'$$'+like_count.__str__()+'$$'+user_name.__str__()+'$$'+ comment.__str__()
                    record_list.append(comment)
            else:
                SIGN = 0
        result_dict[weibo_id] = record_list
        time.sleep(random.randint(2, 3))  # random pause between posts to avoid hammering the server
    except:
        # print(traceback.print_exc())
        print(weibo_id)
        print('*' * 100)
        pass
print("ok") writeintxt(result_dict,'comment1.txt') 基本思路是将要爬的几条微博放入txt文件中,让程序逐条自动爬取。
这样我们就获得了一个爬取后的评论dict——result_dict,以及最后一条微博的评论list—— record_list。
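As a reference for the file formats implied by the code above: weibo_id.txt is read with readfromtxt and split on newlines, so it is assumed to hold one Weibo id per line, and comment1.txt (written by writeintxt) holds one line per id with the id and its comments joined by "####". The helper below, load_comments, is a hypothetical sketch under those assumptions, not part of the original script; it just parses comment1.txt back into a dict for later analysis.

# Minimal sketch: read a comment1.txt-style file back into {weibo_id: [comment, ...]}
# (load_comments and the path below are illustrative, not from the original script)
import codecs

def load_comments(path):
    result = {}
    f = codecs.open(path, 'r', 'utf-8')
    for line in f:
        parts = line.rstrip('\n').split('####')
        if not parts or not parts[0]:
            continue
        # writeintxt appends "####" after every comment, so drop empty trailing fields
        result[parts[0]] = [p for p in parts[1:] if p]
    f.close()
    return result

# comments = load_comments(u'D:/pythondata/spider/web/comment1.txt')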