Python爬虫代码
from pyquery import pyquery
import requests
import json
import time
# Shorthand for the PyQuery constructor used by the rest of the script.
pq = pyquery.PyQuery

# Module-level accumulators filled by get():
#   l     - raw "data-field" metadata dicts of the recorded threads
#   tlist - one table row (a list of cell values) per recorded thread
l, tlist = [], []
def get(start, end):
    """Crawl Tieba thread-list pages and record threads with over 1000 replies.

    Requests list pages ``start`` through ``end`` (inclusive) of the
    "滚动的天空" forum. For every listed thread whose ``reply_num`` is greater
    than 1000, the thread's metadata dict is appended to the module-level
    list ``l`` and, after fetching the thread's first and last pages, a row
    ``[title_html, author, reply_num, last_post_no, tail_info_html]`` is
    appended to the module-level list ``tlist``.

    The function sleeps 15 seconds before fetching each qualifying thread and
    60 seconds after each list page.

    NOTE(review): the original indentation was lost, so the nesting below is
    reconstructed (everything from the 15 s sleep to the summary print is
    treated as the body of the ``reply_num > 1000`` branch, and the 60 s
    sleep as once per list page) - confirm against the intended behaviour.

    Args:
        start: Index of the first list page; page ``i`` is requested with
            ``pn = 50 * i``.
        end: Index of the last list page, inclusive.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 30

    for page in range(start, end + 1):
        response = requests.get(
            "https://tieba.baidu.com/f",
            params={"kw": "滚动的天空", "ie": "utf-8", "pn": 50 * page},
            timeout=timeout,
        )
        listing = pq(response.content)
        print(listing)
        threads = listing.find("#thread_list").find(".j_thread_list")
        print(threads)

        for e in threads:
            entry = pq(e)
            raw_meta = entry.attr("data-field")
            if raw_meta is None:
                # A list item without metadata cannot be evaluated; skip it
                # instead of letting json.loads(None) abort the whole crawl.
                continue
            data = json.loads(raw_meta)
            if data["reply_num"] <= 1000:
                continue

            l.append(data)
            title_link = entry.find("a.j_th_tit")
            href = title_link.attr("href")
            if href is None:
                print("skip (no thread link)", data)
                continue

            time.sleep(15)
            thread_url = "https://tieba.baidu.com" + href
            first_page = pq(requests.get(thread_url, timeout=timeout).content)
            # pn=65535 is used to land on the thread's last page.
            last_page = pq(
                requests.get(thread_url + "?pn=65535", timeout=timeout).content
            )

            # Last tail-info element of the first post; presumably the
            # posting time (it fills the "发布时间" column) - TODO confirm.
            tail_info = first_page.find(
                ".l_post:first-child .post-tail-wrap > .tail-info:last-child"
            )
            raw_last = last_page.find(".l_post").eq(-1).attr("data-field")
            if raw_last is None:
                print("skip (no last post found)", thread_url)
                continue
            lastPost = json.loads(raw_last)
            print("last", lastPost)

            title = title_link.html()
            tlist.append([
                title,
                data["author_nickname"] or data["author_name"],
                data["reply_num"],
                lastPost["content"]["post_no"],
                tail_info.html(),
            ])
            print(title, data["author_name"], data["author_nickname"], data["reply_num"])

        time.sleep(60)
def quick_sort(data):
    """Return the rows of ``data`` ordered by their third field, largest first.

    Each element of ``data`` must be indexable, with a mutually comparable
    value at index 2. The input list is never modified.

    Args:
        data: List of rows to order.

    Returns:
        ``data`` itself (the same list object) when it holds fewer than two
        rows; otherwise a new list with the rows in descending order of
        ``row[2]``. Rows with equal keys keep their relative input order.
    """
    if len(data) < 2:
        # Nothing to reorder; hand back the same object, as callers may rely
        # on the result aliasing the input in this case.
        return data
    # sorted() copies instead of removing the pivot from the caller's list,
    # and does not recurse, so long runs of equal keys cannot overflow the
    # call stack.
    return sorted(data, key=lambda row: row[2], reverse=True)
# NOTE(review): this sort runs when the module is loaded, i.e. before the
# get() call further down has added anything to tlist. quick_sort returns a
# list with fewer than two rows unchanged, so in that case trim is the very
# same list object as tlist rather than a sorted copy - confirm intent.
trim = quick_sort(tlist)
print(trim)

# Skeleton of the output document: <html> -> <body> -> <table> -> <tbody>.
D = pq("<html></html>")
_body = pq("<body></body>").appendTo(D)
T = pq("<table></table>").appendTo(_body)
B = pq("<tbody></tbody>").appendTo(T)

# Header row: one <th> cell per column title.
heading = pq("<tr></tr>").appendTo(B)
for name in "贴子 楼主 回复数 楼层数 发布时间".split(" "):
    header_cell = pq("<th></th>").html(name)
    header_cell.appendTo(heading)
if __name__ == "__main__":
    # Script entry point: crawl, sort the collected rows, render them into
    # the table body B and write the document D to table.txt.
    # NOTE(review): the original indentation was lost; everything below is
    # assumed to belong to this guard - confirm.
    import traceback

    # The crawl is best-effort: whatever was collected before a failure (or
    # before the user interrupts the long-running crawl) is still written
    # out. The error is reported instead of being silently discarded.
    try:
        get(0, 4)
    except (Exception, KeyboardInterrupt):
        traceback.print_exc()

    # Sort only now, after tlist has been filled; a sort performed before the
    # crawl sees an empty list and leaves the rows in crawl order.
    trim = quick_sort(tlist)
    for line in trim:
        tr = pq("<tr></tr>").appendTo(B)
        for item in line:
            pq("<td></td>").html(str(item)).appendTo(tr)
    print(D)

    # Explicit UTF-8 so the non-ASCII cell text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open("table.txt", "w", encoding="utf-8") as file:
        file.write(str(D))