-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2024 Nong-Yi | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<h1>python爬取京东商品评论数据</h1> | ||
|
||
> 作者:Nong-Yi(本人属于借鉴并改进,如有侵权请联系删除,谢谢) | ||
> | ||
> 版本:1.1.0 | ||
> | ||
> 版权:©️Nong-Yi(以下内容为原作者原创,转载请注明出处) | ||
## 改进(所有更新内容均在原作者基础上增加) | ||
- ✅增加了随机UA头,构建了Cookie池(jd_cookies.py),为爬取多种商品做好防风控 | ||
- ✅增加了data目录,用于存储爬取到的数据 | ||
- ✅将每一条评论构建为字典,append到列表中,最后将列表写入CSV文件 | ||
- ❗❗(特别注意)经本人多次测试,最终爬取的单个商品评论数据中有重复部分,建议使用drop_duplicates()去重,不知道的可自行百度,非常简单 | ||
- ❗❗(特别注意)cookie不稳定,容易过期或被服务器拒绝,导致报KeyError:'maxPage'错误。最好不要让cookie池中的账号退出登录,也可人工点击页面的“商品评价”处,手动过服务器的验证(旋转验证码) | ||
- ❗❗(特别注意)单个商品规格的评论可能会报KeyError:'comments'错误,目前暂未解决 | ||
- 🚩TODO: | ||
- 1、增加代理池 | ||
- 2、优化各个函数,重写变量的命名 | ||
- 3、使用rich库代替tqdm模块 | ||
|
||
<h2>模块</h2> | ||
<p>模块使用了re、httpx这些库</p> | ||
<p>httpx模块是一个可以发送网络请求的模块,与requests库相似但有一个requests库没有的功能,就是httpx可以发送http2协议的请求</p> | ||
<p>re库是一个用于处理正则表达式的工具库</p> | ||
<p>关于这两个库大家可以去官网了解我就不过多介绍了</p> | ||
|
||
<pre># 安装方式(安装了可以跳过): | ||
pip install httpx | ||
# 上面安装的httpx没办法使用http2请求,我们还需要继续安装下面这个 | ||
pip install httpx[http2] | ||
# 这样就可以使用http2协议进行请求了 | ||
# re 是 Python 内置的标准库,无需通过 pip 安装</pre> | ||
|
||
<h3>作者的网站地址:www.nong-yi.cn</h3> |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Pool of logged-in JD session Cookie headers. get_cerebrum() picks one at
# random per crawler session to spread requests across accounts and reduce
# the chance of anti-bot blocks.
# NOTE(review): these look like real session credentials committed to source
# control — they expire quickly and should be loaded from an env var or a
# git-ignored config file, then rotated, rather than checked in.
COOKIES_LIST = [
    # Viper3
    "__jdu=1708653980687272576454; shshshfpa=e06713c9-50ca-abad-b1cd-3ad465c28803-1708691725; shshshfpx=e06713c9-50ca-abad-b1cd-3ad465c28803-1708691725; pinId=Dusx94KFDmYhD-3Yboc6LQ; pin=jd_vQgsEnIZspMn; unick=%E4%BD%A0%E4%BB%AC%E6%89%93%E5%9B%A2%E6%88%91%E5%8D%96%E8%90%8C; _tp=Zmq7%2FP8P7IGN6KhKyt6cZQ%3D%3D; _pst=jd_vQgsEnIZspMn; b_webp=1; b_avif=1; autoOpenApp_downCloseDate_auto=1712933137877_1800000; b_dh=951; b_dpr=1; b_dw=1850; __jdv=76161171|direct|-|none|-|1714420442639; 3AB9D23F7A4B3CSS=jdd03J27IVEQH32WSMEDRLBC23DAU3AF25GTMWOGGUPXM66CWQF5VMZKCVBD4U2GKSVGOCI7GDTOBVKGIEODPUKBHKMNDYIAAAAMPFNV6ULQAAAAADFFTS72EYW5Y2UX; _gia_d=1; PCSYCityID=CN_620000_621000_0; 3AB9D23F7A4B3C9B=J27IVEQH32WSMEDRLBC23DAU3AF25GTMWOGGUPXM66CWQF5VMZKCVBD4U2GKSVGOCI7GDTOBVKGIEODPUKBHKMNDYI; TrackID=1G-MmCrg2rDLoozbyXNVPMHSTQaN4nAWOrk4GtuEECd1_PXWL1D6OCP8a6bejS0FD3yZ-rJegmcbXmkcXpd4AMwWgoR5ZhwB9oyQnfKrkS_Mpm9BawUZytrOAA7QPQD8J; thor=82E7DAD074A2F55D0EF704FCF3C4DD87486776F9CCBB1560F7C638540DBADD9342225CC9BFD1053BC83604AEEF4A57B8F1405E13C8B64015794C58FD4B96C4D9DD3C86598DFF1902240F0921C05BD6F4B311AA5150951F998A590944280F869132C14B1A82DCAB53C2AABAE44D14E61C8FE5D7E8F9196EE279D6874A8944421966CA5216652FD45426681117E7AC0CCDF4BE13E26232C6AAC8EC7FF107436168; flash=2_J9DOQAkN6bg0ghyazpOXqiMmc-ibFHz2oEY2V1H7kXG7WfDf-DnxwPk3wH31EevBbHhevrEjEXrITrTbFcbPqPONgb3Y4MHEOnbMufUOOf4VcTjLu-5VGv5fkSP7KBfLZesbzaQzs_HKCITpbp6HU_DaLrKRaevW9glAiOez1OP*; ceshi3.com=000; token=8f638c75d04c119192a1b76d299b6ca8,3,952455; mt_xid=V2_52007VwMUU1RfVlgXQBhbDGEAFFFZXlVfG0wpVQxvVkYGCFhOU09BH0AAbwUUTlVaBl8DGRkPVjBTEVZaXABaL0oYXwB7AhdOX1lDWx1CHVkOZwQiUG1YYlkeShFeAmYAF1taaFdbFk8%3D; __tk=d46abb6fc1ab955839c2e9cce2415fb8,3,952455; jsavif=1; __jda=181111935.1708653980687272576454.1708653980.1712933112.1714420443.6; __jdc=181111935; areaId=28; ipLoc-djd=28-2525-2529-17638; shshshfpb=BApXcEb1kKOpAkb3Zoo1FGQMEU6PeBDnZBkooEAp09xJ1MrKqxYO2; __jdb=181111935.6.1708653980687272576454|6.1714420443"
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
# Copyright (c) 2024. Lorem ipsum dolor sit amet, consectetur adipiscing elit. | ||
# Morbi non lorem porttitor neque feugiat blandit. Ut vitae ipsum eget quam lacinia accumsan. | ||
# Etiam sed turpis ac ipsum condimentum fringilla. Maecenas magna. | ||
# Proin dapibus sapien vel ante. Aliquam erat volutpat. Pellentesque sagittis ligula eget metus. | ||
# Vestibulum commodo. Ut rhoncus gravida arcu. | ||
import csv | ||
import os | ||
import time | ||
import random | ||
import httpx | ||
import re | ||
from fake_useragent import UserAgent | ||
from tqdm import tqdm | ||
|
||
import jd_cookies | ||
|
||
global name | ||
|
||
|
||
def get_dispose_comments(alldata, csv_name=None):
    """Flatten a JD comments payload and append the rows to data/<name>.csv.

    Creates the CSV (with a header row) on first write, otherwise appends.

    Parameters:
        alldata: dict parsed from the comments API response; must contain a
            'comments' list whose items have 'content', 'creationTime' and
            'productColor' keys ('location' is optional in the raw data).
        csv_name: optional CSV base name; defaults to the module-level
            `name` set by get_cerebrum() (kept for backward compatibility
            with existing callers).

    Raises:
        KeyError: if the payload lacks the expected keys — e.g. when the
            server returned a challenge page instead of comment data.
    """
    target = csv_name if csv_name is not None else name
    # One flat record per comment. 'location' (commenter's IP region) can be
    # missing, so default it to '无' ("none") instead of duplicating the
    # whole record-building branch as the original did.
    results = [
        {
            'content': data['content'],
            'creationtime': data['creationTime'],
            'location': data.get('location', '无'),
            'productcolor': data['productColor'],
        }
        for data in alldata['comments']
    ]
    file_exists = os.path.isfile(f'data/{target}.csv')
    # `with` guarantees the handle is closed/flushed even if writing fails;
    # the original left the file open, risking lost buffered rows.
    with open(f'data/{target}.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['content', 'creationtime', 'location', 'productcolor'])
        if not file_exists:
            writer.writeheader()  # header only when the file is new
        writer.writerows(results)
|
||
|
||
def get_forms_comments(productid, client):
    """Request page 0 of a product's comments from the JD item API.

    Parameters:
        productid: JD product id (string) used for the 'productId' field.
        client: httpx.Client already carrying the UA/Cookie headers.

    Returns:
        (alldata, maxpage): the parsed JSON payload and its 'maxPage' value.

    Raises:
        KeyError: if a 200 response lacks 'maxPage' — typically an expired
            cookie or an anti-bot challenge page (see README notes).
    """
    url = 'https://api.m.jd.com/?appid=item-v3'
    data = {
        'functionId': 'pc_club_productPageComments',
        'client': 'pc',
        'clientVersion': '1.0.0',
        't': 0,  # refreshed with a millisecond timestamp on every attempt
        'loginType': '3',
        'uuid': '181111935.1706791191786871307752.1706791191.1712766948.1712794165.2',
        'productId': productid,  # product code
        'score': '0',
        'sortType': '5',
        'page': '0',  # first page only; later pages use get_forms_comments2
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
        'bbtf': '',
        'shield': ''
    }
    # Retry until HTTP 200. The original retried via recursion but dropped
    # the recursive call's return value, so a single non-200 response made
    # the caller unpack None and crash; recursion was also unbounded.
    while True:
        data['t'] = int(time.time() * 1000)
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            alldata = resp.json()
            return alldata, alldata['maxPage']
        time.sleep(1)  # brief back-off so retries don't hammer the API
|
||
|
||
def get_dispose_comments2(alldata, csv_name=None):
    """Flatten a comments payload (pagination path) and append the rows to
    data/<name>.csv, writing a header row only when the file is new.

    NOTE(review): this duplicates get_dispose_comments; the two should be
    merged once callers are unified (README TODO item 2).

    Parameters:
        alldata: dict parsed from the comments API response; must contain a
            'comments' list whose items have 'content', 'creationTime' and
            'productColor' keys ('location' is optional in the raw data).
        csv_name: optional CSV base name; defaults to the module-level
            `name` set by get_cerebrum() (backward compatible).

    Raises:
        KeyError: if the payload lacks the expected keys.
    """
    target = csv_name if csv_name is not None else name
    # 'location' may be absent, so supply the '无' ("none") fallback instead
    # of duplicating the record-building branch as the original did.
    results = [
        {
            'content': data['content'],
            'creationtime': data['creationTime'],
            'location': data.get('location', '无'),
            'productcolor': data['productColor'],
        }
        for data in alldata['comments']
    ]
    file_exists = os.path.isfile(f'data/{target}.csv')
    # `with` closes/flushes the handle even on error; the original left the
    # file open, which could drop buffered rows.
    with open(f'data/{target}.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['content', 'creationtime', 'location', 'productcolor'])
        if not file_exists:
            writer.writeheader()  # header only on first creation
        writer.writerows(results)
|
||
|
||
def get_forms_comments2(productid, client, i):
    """Fetch page *i* of a product's comments (the pagination path) and hand
    the payload to get_dispose_comments2() for CSV storage.

    Parameters:
        productid: JD product id for the 'productId' field.
        client: httpx.Client carrying the UA/Cookie headers.
        i: page index requested from the API (caller iterates 1..maxPage).
    """
    url = 'https://api.m.jd.com/?appid=item-v3'
    data = {
        'functionId': 'pc_club_productPageComments',
        'client': 'pc',
        'clientVersion': '1.0.0',
        't': 0,  # refreshed with a millisecond timestamp on every attempt
        'loginType': '3',
        'uuid': '181111935.1706791191786871307752.1706791191.1712766948.1712794165.2',
        'productId': productid,  # product code
        'score': '0',
        'sortType': '5',
        'page': i,  # pagination
        'pageSize': '10',
        'isShadowSku': '0',
        'rid': '0',
        'fold': '1',
        'bbtf': '',
        'shield': ''
    }
    # Retry until HTTP 200; the original retried via unbounded recursion,
    # which could hit the recursion limit on a flaky connection.
    while True:
        data['t'] = int(time.time() * 1000)
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            get_dispose_comments2(resp.json())
            return
        time.sleep(1)  # brief back-off between retries
|
||
|
||
def get_crawling_homepage(client, name):
    """Search JD for *name* and scrape the first results page.

    Parameters:
        client: httpx.Client carrying the UA/Cookie headers.
        name: product keyword to search for (shadows the module global of
            the same name on purpose; callers pass it explicitly).

    Returns:
        list of (url_homepage, commodity, productid) tuples. url_homepage is
        the item href prefixed with the literal 'https' (no '://' — the
        caller re-inserts it via str.replace); commodity is the keyword-
        highlighted title fragment; productid comes from the item URL.
    """
    url = 'https://search.jd.com/Search?'
    data = {
        'keyword': name,
        # Fixed: the original sent 'utf - 8' (with spaces), which is not a
        # valid value for the search page's encoding parameter.
        'enc': 'utf-8',
        'spm': 'a.0.0',
        'pvid': '1de57a0845254674b2e422004fccbf59'
    }
    # Item links look like //item.jd.com/<productId>.html; the <font> tag
    # wraps the part of the title that matched the keyword.
    pattern = re.compile(
        r'<div class="p-name p-name-type-2">.*?href="(?P<url>.*?)"'
        r'.*?<em>.*?<font class="skcolor_ljg">(?P<name>.*?)</font>.*?</em>',
        re.S)
    # Retry until JD answers with HTTP 200. The original recursed on failure
    # but dropped the recursive call's return value, so one bad response
    # made the caller iterate over None and crash.
    while True:
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            results = []
            for item in pattern.finditer(resp.text):
                url_homepage = 'https' + item.group('url')  # 'https//item.jd.com/...'
                commodity = item.group('name')
                # '//item.jd.com/<id>.html' -> split('/')[3] == '<id>.html'
                productid = url_homepage.split('/')[3].split('.')[0]
                results.append((url_homepage, commodity, productid))
            return results
        print("请求失败正在为您重新请求,请求状态码:", resp)
        time.sleep(1)
|
||
|
||
def get_cerebrum():
    """Interactive entry point: ask for a product keyword, search JD for
    matching items, then crawl every item's comment pages into CSV.

    Side effects: sets the module-level `name` (read by the CSV writers),
    performs network I/O via httpx, and appends rows to data/<name>.csv.
    """
    global name
    ua = UserAgent()  # fresh random User-Agent per run to reduce blocking
    name = input('请输入你要查询商品评论的商品名称:')
    headers = {
        'User-Agent': ua.random,  # random UA
        'Cookie': random.choice(jd_cookies.COOKIES_LIST),  # rotate the cookie pool
        'Referer': 'https://www.jd.com/'
    }
    # http2=True needs the httpx[http2] extra (see README install notes).
    client = httpx.Client(http2=True, headers=headers, timeout=15)
    # Scrape the search results page for (url, title, productId) triples.
    results = get_crawling_homepage(client=client, name=name)
    for url_homepage, commodity, productid in results:
        # get_crawling_homepage returns 'https//…'; re-insert the scheme
        # separator for display.
        display_url = url_homepage.replace("https", "https://")
        print('-------------------------------------------------------------------------------------------------------------------------------------------------------------------')
        print('商品名称:' + commodity, '页面地址:' + display_url)
        # Page 0 payload plus the reported page count.
        alldata, maxpage = get_forms_comments(productid=productid, client=client)
        if maxpage == 0:
            print('没有评论哦~')
            continue
        # The original had separate `== 1` and `>= 1` branches that both
        # started by storing page 0; collapsed into one path.
        get_dispose_comments(alldata)
        if maxpage > 1:
            # NOTE(review): kept the original range 1..maxpage inclusive —
            # if the API's pages are 0-indexed the final request may be past
            # the last page; preserved to avoid changing behavior.
            for page in tqdm(range(1, maxpage + 1), colour='white',
                             desc='正在获取以上链接商品规格的评论'):
                get_forms_comments2(i=page, client=client, productid=productid)
                time.sleep(3)  # throttle pagination to look less bot-like
|
||
|
||
# Run the interactive crawler only when executed as a script (not on import).
if __name__ == '__main__':
    get_cerebrum()