Web Crawler

Published on 2021-04-17


Scraping Weibo Comments

import requests
import json
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    # a logged-in Weibo Cookie is required; substitute your own
    'Cookie': 'BAIDU_SSP_lcr=https://www.baidu.com/link?url=ghdyIQEzRiB1cPlr4a5NN9nqpDmgJBtFGX_VNNUsbea&wd=&eqid=fe485bfd0037a1d600000003607a303a; SUB=_2A25NfkDSDeRhGeVN6VIT9y3LyDmIHXVugWCarDV6PUJbkdAKLWbHkW1NTGkubBs2Tq-yRFynBFAHt5Hwz-KJN7Ho; _T_WM=86882345218; MLOGIN=1; WEIBOCN_FROM=1110106030; XSRF-TOKEN=6f2207; M_WEIBOCN_PARAMS=oid%3D4602765680061207%26luicode%3D10000011%26lfid%3D1076031197369013'
}

max_id = ""
text_sum = 0
ping_sum = 0
def get_comment():
    global max_id, text_sum,ping_sum
    max_id_type = 0
    if text_sum >= 14:
        max_id_type = 1
    else:
        max_id_type = 0
    url = 'https://m.weibo.cn/comments/hotflow?id=4602765680061207&mid=4602765680061207&max_id=' + str(
        max_id) + '&max_id_type=' + str(max_id_type)
    print(url)
    html = requests.get(url, headers=headers)
    html_text = json.loads(html.text)  # parse the JSON response into a dict
    for i in html_text['data']['data']:
        ping_sum += 1
        text = i['text']
        without_img = re.compile(r'<span class="url-icon">.*?</span>', re.S)
        true_text = re.sub(without_img, '', text)  # strip embedded emoji/icon <span> tags
        print(true_text)
        with open('D:\\text.txt', "a", encoding="utf8") as f:
            f.write(true_text + '\n')
    max_id = html_text['data']['max_id']  # cursor for the next page
    text_sum += 1
    print(text_sum, ping_sum)

if __name__ == '__main__':
    # fetch the first page (no max_id parameter yet)
    url = ('https://m.weibo.cn/comments/hotflow?id=4602765680061207'
           '&mid=4602765680061207&max_id_type=0')
    html = requests.get(url, headers=headers)
    html_text = json.loads(html.text)
    for i in html_text['data']['data']:
        text = i['text']
        without_img = re.compile(r'<span class="url-icon">.*?</span>', re.S)
        true_text = re.sub(without_img, '', text)
        print(true_text)
        with open('D:\\text.txt', "a", encoding="utf8") as f:
            f.write(true_text + '\n')
    # pages after the first: follow the max_id cursor
    max_id = html_text['data']['max_id']
    print(max_id)
    for i in range(2, 50):
        get_comment()
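
One caveat: this endpoint appears to signal the last page by returning max_id = 0, so the fixed range(2, 50) loop can keep hitting the API after the comments run out. A minimal guard, my own addition rather than part of the original script, would be:

# stop once the API reports max_id == 0, i.e. no further pages
for i in range(2, 50):
    get_comment()
    if max_id == 0:
        break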

If the code above ever goes missing, here is a backup: scraper code link
Making the Word Cloud

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba

path_txt = 'D://text.txt'
with open(path_txt, 'r', encoding='UTF-8') as f:
    text = f.read()

# jieba segments the Chinese text; WordCloud expects space-separated tokens
cut_text = " ".join(jieba.cut(text))

wordcloud = WordCloud(
    # a Chinese font is required, otherwise the characters render as boxes
    font_path="C:/Windows/Fonts/simfang.ttf",
    # background colour and canvas size
    background_color="white", width=1500, height=1080).generate(cut_text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
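
If you also want to keep the image on disk, WordCloud can export it directly; this one-liner is my addition, not part of the original post:

wordcloud.to_file('D://wordcloud.png')  # write the rendered cloud out as a PNG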

The scraped txt file (the comments are about Wang Feng's daughter; the post was picked at random), a little over 780 comments in all:
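
To double-check that count, a quick sanity check I would add (assuming the same output path as above, with one comment per line):

with open('D:\\text.txt', encoding='utf8') as f:
    print(sum(1 for _ in f))  # number of comments written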

The generated word cloud:


"穷且益坚,不坠青云之志"