-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2024 Nong-Yi | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<h1>python爬取京东商品评论数据</h1> | ||
|
||
> 作者:Nong-Yi(本人属于借鉴并改进,如有侵权请联系删除,谢谢) | ||
> | ||
> 版本:1.1.0 | ||
> | ||
> 版权:©️Nong-Yi(以下内容为原作者原创,转载请注明出处) | ||
## 改进(所有更新内容均在原作者基础上增加) | ||
- ✅增加了随机UA头,构建了Cookie池(jd_cookies.py),为爬取多种商品做好防风控 | ||
- ✅增加了data目录,用于存储爬取到的数据 | ||
- ✅将每一条评论构建为字典,append到列表中,最后将列表写入CSV文件 | ||
- ❗❗(特别注意)经本人多次测试,最终爬取的单个商品评论数据中有重复部分,建议使用drop_duplicates()去重,不知道的可自行百度,非常简单 | ||
- ❗❗(特别注意)cookie不稳定,容易过期或被服务器拒绝,导致报KeyError:'maxPage'错误。最好不要让cookie池中的账号退出登录,也可人工点击页面的“商品评价”处,手动过服务器的验证(旋转验证码) | ||
- ❗❗(特别注意)单个商品规格的评论可能会报KeyError:'comments'错误,目前暂未解决 | ||
- 🚩TODO: | ||
- 1、增加代理池 | ||
- 2、优化各个函数,重写变量的命名 | ||
- 3、使用rich库代替tqdm模块 | ||
|
||
<h2>模块</h2> | ||
<p>模块使用了re、httpx这些库</p> | ||
<p>httpx模块是一个可以发送网络请求的模块,与requests库相似但有一个requests库没有的功能,就是httpx可以发送http2协议的请求</p> | ||
<p>re库是一个用于处理正则表达式的工具库</p> | ||
<p>关于这两个库大家可以去官网了解我就不过多介绍了</p> | ||
|
||
<pre># 安装方式(安装了可以跳过): | ||
pip install httpx | ||
# 上面安装的httpx没办法使用http2请求,我们还需要继续安装下面这个 | ||
pip install httpx[http2] | ||
# 这样就可以使用http2协议进行请求了 | ||
# re 是 Python 内置的标准库,无需通过 pip 安装</pre> | ||
|
||
<h3>作者的网站地址:www.nong-yi.cn</h3> |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Pool of logged-in JD session Cookie headers. get_cerebrum() picks one at
# random per crawler session to spread requests across accounts and reduce
# the chance of anti-bot blocks.
# NOTE(review): these look like real session credentials committed to source
# control — they expire quickly and should be loaded from an env var or a
# git-ignored config file, then rotated, rather than checked in.
COOKIES_LIST = [
    # Viper3
    "__jdu=1708653980687272576454; shshshfpa=e06713c9-50ca-abad-b1cd-3ad465c28803-1708691725; shshshfpx=e06713c9-50ca-abad-b1cd-3ad465c28803-1708691725; pinId=Dusx94KFDmYhD-3Yboc6LQ; pin=jd_vQgsEnIZspMn; unick=%E4%BD%A0%E4%BB%AC%E6%89%93%E5%9B%A2%E6%88%91%E5%8D%96%E8%90%8C; _tp=Zmq7%2FP8P7IGN6KhKyt6cZQ%3D%3D; _pst=jd_vQgsEnIZspMn; b_webp=1; b_avif=1; autoOpenApp_downCloseDate_auto=1712933137877_1800000; b_dh=951; b_dpr=1; b_dw=1850; __jdv=76161171|direct|-|none|-|1714420442639; 3AB9D23F7A4B3CSS=jdd03J27IVEQH32WSMEDRLBC23DAU3AF25GTMWOGGUPXM66CWQF5VMZKCVBD4U2GKSVGOCI7GDTOBVKGIEODPUKBHKMNDYIAAAAMPFNV6ULQAAAAADFFTS72EYW5Y2UX; _gia_d=1; PCSYCityID=CN_620000_621000_0; 3AB9D23F7A4B3C9B=J27IVEQH32WSMEDRLBC23DAU3AF25GTMWOGGUPXM66CWQF5VMZKCVBD4U2GKSVGOCI7GDTOBVKGIEODPUKBHKMNDYI; TrackID=1G-MmCrg2rDLoozbyXNVPMHSTQaN4nAWOrk4GtuEECd1_PXWL1D6OCP8a6bejS0FD3yZ-rJegmcbXmkcXpd4AMwWgoR5ZhwB9oyQnfKrkS_Mpm9BawUZytrOAA7QPQD8J; thor=82E7DAD074A2F55D0EF704FCF3C4DD87486776F9CCBB1560F7C638540DBADD9342225CC9BFD1053BC83604AEEF4A57B8F1405E13C8B64015794C58FD4B96C4D9DD3C86598DFF1902240F0921C05BD6F4B311AA5150951F998A590944280F869132C14B1A82DCAB53C2AABAE44D14E61C8FE5D7E8F9196EE279D6874A8944421966CA5216652FD45426681117E7AC0CCDF4BE13E26232C6AAC8EC7FF107436168; flash=2_J9DOQAkN6bg0ghyazpOXqiMmc-ibFHz2oEY2V1H7kXG7WfDf-DnxwPk3wH31EevBbHhevrEjEXrITrTbFcbPqPONgb3Y4MHEOnbMufUOOf4VcTjLu-5VGv5fkSP7KBfLZesbzaQzs_HKCITpbp6HU_DaLrKRaevW9glAiOez1OP*; ceshi3.com=000; token=8f638c75d04c119192a1b76d299b6ca8,3,952455; mt_xid=V2_52007VwMUU1RfVlgXQBhbDGEAFFFZXlVfG0wpVQxvVkYGCFhOU09BH0AAbwUUTlVaBl8DGRkPVjBTEVZaXABaL0oYXwB7AhdOX1lDWx1CHVkOZwQiUG1YYlkeShFeAmYAF1taaFdbFk8%3D; __tk=d46abb6fc1ab955839c2e9cce2415fb8,3,952455; jsavif=1; __jda=181111935.1708653980687272576454.1708653980.1712933112.1714420443.6; __jdc=181111935; areaId=28; ipLoc-djd=28-2525-2529-17638; shshshfpb=BApXcEb1kKOpAkb3Zoo1FGQMEU6PeBDnZBkooEAp09xJ1MrKqxYO2; __jdb=181111935.6.1708653980687272576454|6.1714420443"
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
# Copyright (c) 2024. Lorem ipsum dolor sit amet, consectetur adipiscing elit. | ||
# Morbi non lorem porttitor neque feugiat blandit. Ut vitae ipsum eget quam lacinia accumsan. | ||
# Etiam sed turpis ac ipsum condimentum fringilla. Maecenas magna. | ||
# Proin dapibus sapien vel ante. Aliquam erat volutpat. Pellentesque sagittis ligula eget metus. | ||
# Vestibulum commodo. Ut rhoncus gravida arcu. | ||
import csv | ||
import os | ||
import time | ||
import random | ||
import httpx | ||
import re | ||
from fake_useragent import UserAgent | ||
from tqdm import tqdm | ||
|
||
import jd_cookies | ||
|
||
global name | ||
|
||
|
||
def get_dispose_comments(alldata, csv_name=None):
    """Flatten a JD comments payload and append the rows to data/<name>.csv.

    Creates the CSV (with a header row) on first write, otherwise appends.

    Parameters:
        alldata: dict parsed from the comments API response; must contain a
            'comments' list whose items have 'content', 'creationTime' and
            'productColor' keys ('location' is optional in the raw data).
        csv_name: optional CSV base name; defaults to the module-level
            `name` set by get_cerebrum() (kept for backward compatibility
            with existing callers).

    Raises:
        KeyError: if the payload lacks the expected keys — e.g. when the
            server returned a challenge page instead of comment data.
    """
    target = csv_name if csv_name is not None else name
    # One flat record per comment. 'location' (commenter's IP region) can be
    # missing, so default it to '无' ("none") instead of duplicating the
    # whole record-building branch as the original did.
    results = [
        {
            'content': data['content'],
            'creationtime': data['creationTime'],
            'location': data.get('location', '无'),
            'productcolor': data['productColor'],
        }
        for data in alldata['comments']
    ]
    file_exists = os.path.isfile(f'data/{target}.csv')
    # `with` guarantees the handle is closed/flushed even if writing fails;
    # the original left the file open, risking lost buffered rows.
    with open(f'data/{target}.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['content', 'creationtime', 'location', 'productcolor'])
        if not file_exists:
            writer.writeheader()  # header only when the file is new
        writer.writerows(results)
|
||
|
||
def get_forms_comments(productid, client):
    """Request page 0 of a product's comments from the JD item API.

    Parameters:
        productid: JD product id (string) used for the 'productId' field.
        client: httpx.Client already carrying the UA/Cookie headers.

    Returns:
        (alldata, maxpage): the parsed JSON payload and its 'maxPage' value.

    Raises:
        KeyError: if a 200 response lacks 'maxPage' — typically an expired
            cookie or an anti-bot challenge page (see README notes).
    """
    url = 'https://api.m.jd.com/?appid=item-v3'
    data = {
        'functionId': 'pc_club_productPageComments',
        'client': 'pc',
        'clientVersion': '1.0.0',
        't': 0,  # refreshed with a millisecond timestamp on every attempt
        'loginType': '3',
        'uuid': '181111935.1706791191786871307752.1706791191.1712766948.1712794165.2',
        'productId': productid,  # product code
        'score': '0',
        'sortType': '5',
        'page': '0',  # first page only; later pages use get_forms_comments2
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
        'bbtf': '',
        'shield': ''
    }
    # Retry until HTTP 200. The original retried via recursion but dropped
    # the recursive call's return value, so a single non-200 response made
    # the caller unpack None and crash; recursion was also unbounded.
    while True:
        data['t'] = int(time.time() * 1000)
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            alldata = resp.json()
            return alldata, alldata['maxPage']
        time.sleep(1)  # brief back-off so retries don't hammer the API
|
||
|
||
def get_dispose_comments2(alldata, csv_name=None):
    """Flatten a comments payload (pagination path) and append the rows to
    data/<name>.csv, writing a header row only when the file is new.

    NOTE(review): this duplicates get_dispose_comments; the two should be
    merged once callers are unified (README TODO item 2).

    Parameters:
        alldata: dict parsed from the comments API response; must contain a
            'comments' list whose items have 'content', 'creationTime' and
            'productColor' keys ('location' is optional in the raw data).
        csv_name: optional CSV base name; defaults to the module-level
            `name` set by get_cerebrum() (backward compatible).

    Raises:
        KeyError: if the payload lacks the expected keys.
    """
    target = csv_name if csv_name is not None else name
    # 'location' may be absent, so supply the '无' ("none") fallback instead
    # of duplicating the record-building branch as the original did.
    results = [
        {
            'content': data['content'],
            'creationtime': data['creationTime'],
            'location': data.get('location', '无'),
            'productcolor': data['productColor'],
        }
        for data in alldata['comments']
    ]
    file_exists = os.path.isfile(f'data/{target}.csv')
    # `with` closes/flushes the handle even on error; the original left the
    # file open, which could drop buffered rows.
    with open(f'data/{target}.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['content', 'creationtime', 'location', 'productcolor'])
        if not file_exists:
            writer.writeheader()  # header only on first creation
        writer.writerows(results)
|
||
|
||
def get_forms_comments2(productid, client, i):
    """Fetch page *i* of a product's comments (the pagination path) and hand
    the payload to get_dispose_comments2() for CSV storage.

    Parameters:
        productid: JD product id for the 'productId' field.
        client: httpx.Client carrying the UA/Cookie headers.
        i: page index requested from the API (caller iterates 1..maxPage).
    """
    url = 'https://api.m.jd.com/?appid=item-v3'
    data = {
        'functionId': 'pc_club_productPageComments',
        'client': 'pc',
        'clientVersion': '1.0.0',
        't': 0,  # refreshed with a millisecond timestamp on every attempt
        'loginType': '3',
        'uuid': '181111935.1706791191786871307752.1706791191.1712766948.1712794165.2',
        'productId': productid,  # product code
        'score': '0',
        'sortType': '5',
        'page': i,  # pagination
        'pageSize': '10',
        'isShadowSku': '0',
        'rid': '0',
        'fold': '1',
        'bbtf': '',
        'shield': ''
    }
    # Retry until HTTP 200; the original retried via unbounded recursion,
    # which could hit the recursion limit on a flaky connection.
    while True:
        data['t'] = int(time.time() * 1000)
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            get_dispose_comments2(resp.json())
            return
        time.sleep(1)  # brief back-off between retries
|
||
|
||
def get_crawling_homepage(client, name):
    """Search JD for *name* and scrape the first results page.

    Parameters:
        client: httpx.Client carrying the UA/Cookie headers.
        name: product keyword to search for (shadows the module global of
            the same name on purpose; callers pass it explicitly).

    Returns:
        list of (url_homepage, commodity, productid) tuples. url_homepage is
        the item href prefixed with the literal 'https' (no '://' — the
        caller re-inserts it via str.replace); commodity is the keyword-
        highlighted title fragment; productid comes from the item URL.
    """
    url = 'https://search.jd.com/Search?'
    data = {
        'keyword': name,
        # Fixed: the original sent 'utf - 8' (with spaces), which is not a
        # valid value for the search page's encoding parameter.
        'enc': 'utf-8',
        'spm': 'a.0.0',
        'pvid': '1de57a0845254674b2e422004fccbf59'
    }
    # Item links look like //item.jd.com/<productId>.html; the <font> tag
    # wraps the part of the title that matched the keyword.
    pattern = re.compile(
        r'<div class="p-name p-name-type-2">.*?href="(?P<url>.*?)"'
        r'.*?<em>.*?<font class="skcolor_ljg">(?P<name>.*?)</font>.*?</em>',
        re.S)
    # Retry until JD answers with HTTP 200. The original recursed on failure
    # but dropped the recursive call's return value, so one bad response
    # made the caller iterate over None and crash.
    while True:
        resp = client.get(url, params=data)
        if resp.status_code == 200:
            results = []
            for item in pattern.finditer(resp.text):
                url_homepage = 'https' + item.group('url')  # 'https//item.jd.com/...'
                commodity = item.group('name')
                # '//item.jd.com/<id>.html' -> split('/')[3] == '<id>.html'
                productid = url_homepage.split('/')[3].split('.')[0]
                results.append((url_homepage, commodity, productid))
            return results
        print("请求失败正在为您重新请求,请求状态码:", resp)
        time.sleep(1)
|
||
|
||
def get_cerebrum():
    """Interactive entry point: ask for a product keyword, search JD for
    matching items, then crawl every item's comment pages into CSV.

    Side effects: sets the module-level `name` (read by the CSV writers),
    performs network I/O via httpx, and appends rows to data/<name>.csv.
    """
    global name
    ua = UserAgent()  # fresh random User-Agent per run to reduce blocking
    name = input('请输入你要查询商品评论的商品名称:')
    headers = {
        'User-Agent': ua.random,  # random UA
        'Cookie': random.choice(jd_cookies.COOKIES_LIST),  # rotate the cookie pool
        'Referer': 'https://www.jd.com/'
    }
    # http2=True needs the httpx[http2] extra (see README install notes).
    client = httpx.Client(http2=True, headers=headers, timeout=15)
    # Scrape the search results page for (url, title, productId) triples.
    results = get_crawling_homepage(client=client, name=name)
    for url_homepage, commodity, productid in results:
        # get_crawling_homepage returns 'https//…'; re-insert the scheme
        # separator for display.
        display_url = url_homepage.replace("https", "https://")
        print('-------------------------------------------------------------------------------------------------------------------------------------------------------------------')
        print('商品名称:' + commodity, '页面地址:' + display_url)
        # Page 0 payload plus the reported page count.
        alldata, maxpage = get_forms_comments(productid=productid, client=client)
        if maxpage == 0:
            print('没有评论哦~')
            continue
        # The original had separate `== 1` and `>= 1` branches that both
        # started by storing page 0; collapsed into one path.
        get_dispose_comments(alldata)
        if maxpage > 1:
            # NOTE(review): kept the original range 1..maxpage inclusive —
            # if the API's pages are 0-indexed the final request may be past
            # the last page; preserved to avoid changing behavior.
            for page in tqdm(range(1, maxpage + 1), colour='white',
                             desc='正在获取以上链接商品规格的评论'):
                get_forms_comments2(i=page, client=client, productid=productid)
                time.sleep(3)  # throttle pagination to look less bot-like
|
||
|
||
# Run the interactive crawler only when executed as a script (not on import).
if __name__ == '__main__':
    get_cerebrum()