编辑整理:整理来源:抖音,浏览量:2739,时间:2022-10-11 22:14:01
京东手机评论数据采集与分析,手机京东怎么看评价,京东手机评估
import random
import re
import time
import xlsxwriter
from selenium import webdriver
from lxml import etree
import requests
# 2. Build the UA pool and the IP (proxy) pool.
# UA pool: one User-Agent is chosen at random per run so repeated requests
# do not all present the same client fingerprint.
# (Bugfix: the original line here had a section heading fused into the
# comment, which was a syntax error.)
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x32) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.39 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.26 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X; zh-CN) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/17D50 UCBrowser/12.8.2.1268 Mobile AliApp(TUnionSDK/0.1.20.3)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
'Mozilla/5.0 (Linux; Android 8.1.0; OPPO R11t Build/OPM1.171019.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/76.0.3809.89 Mobile Safari/537.36 T7/11.19 SP-engine/2.15.0 baiduboxapp/11.19.5.10 (Baidu; P1 8.1.0)',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 SP-engine/2.14.0 main%2F1.0 baiduboxapp/11.18.0.16 (Baidu; P2 13.3.1) NABar/0.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
]
ua = random.choice(ua_list)
# IP (proxy) pool.
# Bugfix: the original entries used the key expression `"http" or "https"`,
# which Python evaluates to just "http" (a truthy short-circuit), so every
# HTTPS request silently bypassed the proxy.  Each entry now maps both
# schemes to the same proxy address, which is what `requests` expects.
proxy_list = [
{"http": "124.71.14.222:10002", "https": "124.71.14.222:10002"},
{"http": '60.167.133.17:1133', "https": '60.167.133.17:1133'},
{"http": '183.0.203.167:8118', "https": '183.0.203.167:8118'},
{"http": '111.231.86.149:7890', "https": '111.231.86.149:7890'},
{"http": "183.0.203.167:8118", "https": "183.0.203.167:8118"},
{"http": '163.125.222.12:8118', "https": '163.125.222.12:8118'},
{"http": '111.59.199.58:8118', "https": '111.59.199.58:8118'},
]
# One proxy is picked at random for the whole run.
proxies = random.choice(proxy_list)
# 3. Read the search keyword and the number of result pages to crawl.
# Bugfix: the original line had a section heading fused in front of the
# `word = ...` assignment, which made it a syntax error.
word = input('请输入你要获取的商品:', )
page = input('请输入商品页数:',)
# Collect product detail links and SKU ids from the search result pages.
def get_link():
    """Scrape ``page`` JD search pages for the keyword ``word``.

    Uses the module-level ``ua`` / ``proxies`` pools for each request.
    Collects every product detail URL and its SKU id, then hands both
    (parallel) lists to ``goods()``.
    """
    links = []
    skus_id = []
    # Hoisted out of the loop: the headers are invariant across pages.
    headers = {
        "user-agent": ua,
    }
    # Raw string: \d is a regex escape, not a string escape.
    sku_pattern = re.compile(r'\d+')
    for i in range(int(page)):
        url = f'https://search.jd.com/Search?keyword={word}&wq={word}&page={i}'
        res = requests.get(url=url, headers=headers, proxies=proxies).text
        time.sleep(0.5)  # polite delay between page requests
        # Extract the product links; hrefs come scheme-less
        # ("//item.jd.com/...") so prefix "https:".
        html = etree.HTML(res)
        link = html.xpath('//*[@id="J_goodsList"]/ul/li[*]/div/div[3]/a/@href')
        link = ['https:' + k for k in link]
        # Bugfix: the original called the non-existent list method .appd().
        links.extend(link)
        # The SKU id is the first run of digits in each product URL.
        # (Bugfix: the original comprehension reused the loop variable name.)
        skus_id.extend(sku_pattern.findall(u)[0] for u in link)
        print(f'第{i+1}页。')
    print(links)
    goods(links, skus_id)
# 4. Fetch product detail data.
def goods(links, skus_id):
    """Visit each product page, extract its detail fields, download the
    main image, then hand everything to ``save()``.

    links   -- list of product detail URLs
    skus_id -- list of SKU id strings, parallel to ``links``
    """
    goo = []   # one tuple of fields per product
    pict = 0   # sequence number used to name the downloaded images
    for i in range(len(links)):
        headers = {
            "User-Agent": ua,
            'referer': 'https://search.jd.com/',
        }
        res = requests.get(url=links[i], headers=headers, proxies=proxies).text
        time.sleep(2)  # polite delay between product pages
        html = etree.HTML(res)
        # Shop name
        title = html.xpath('//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a/@title')
        print(title)
        # Brand
        brand = html.xpath('//*[@id="parameter-brand"]/li/@title')
        print(brand)
        # Product serial number: keep only the part after the colon.
        serial = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[2]/text()')
        serial = [serial[0].split(':')[-1]]
        print(serial)
        # Official product name: part after the colon, whitespace-stripped.
        official = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/text()')
        official = [official[0].split(':')[-1].strip()]
        print(official)
        # Page product name: the node may yield one or two text fragments;
        # the meaningful one is the last.
        name = html.xpath('/html/body/div[6]/div/div[2]/div[1]/text()')
        if len(name) == 1:
            name = [name[0].strip()]
        elif len(name) == 2:
            name = [name[1].strip()]
        print(name)
        # First main product image (scheme-less URL, prefix "https:").
        picture = ['https:' + html.xpath('//*[@id="spec-img"]/@data-origin')[0]]
        print(picture)
        # Download the image to disk; save() later embeds it by this index.
        # (Bugfix: the original called `op(...)`, which does not exist —
        # it must be the builtin `open`.)
        res2 = requests.get(url=picture[0], headers=headers)
        with open(f'D:\pythonproject\python项目爬虫\接单\京东商品评价获取(接单考核)\商品图片/{pict}.jpg', 'wb') as f:
            f.write(res2.content)
        pict += 1
        # JD price: fetched from the price JSON endpoint.
        p = requests.get('https://p.3.cn/prices/mgets?skuIds=J_' + skus_id[i], headers=headers, proxies=proxies).text
        print(p)
        price = re.findall('"p":"(.*?)","op"', p)
        print(price)
        # Coupons and promotions are rendered client-side, so a headless
        # browser is needed to see them.
        options = webdriver.ChromeOptions()  # headless mode
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        driver.get(links[i])
        time.sleep(1)
        data = driver.page_source  # fully-rendered HTML
        time.sleep(0.5)
        driver.close()
        driver.quit()
        # Promotions: up to two entries; fall back to a placeholder.
        html2 = etree.HTML(data)
        promotion1 = html2.xpath('//*[@id="prom"]/div/div[1]/em[2]/text()')
        promotion2 = html2.xpath('//*[@id="prom"]/div/div[2]/em[2]/text()')
        if promotion1 == [] and promotion2 == []:
            promotion = ['暂无促销信息']
        elif promotion1 == [] and promotion2 != []:
            promotion = promotion2
        elif promotion2 == [] and promotion1 != []:
            promotion = promotion1
        else:
            promotion = [promotion1[0], promotion2[0]]
        print(promotion)
        # Coupon info, with a placeholder when none is available.
        coupon = html2.xpath('//*[@id="summary-quan"]/div[2]/dl/dd/a/span/span/text()')
        if coupon == []:
            coupon = ['暂无可领的优惠券']
        print(coupon)
        # Cumulative review count, from the comment-summary JSON endpoint.
        comm_url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={skus_id[i]}'
        comment_headers = {
            'user-agent': ua,
        }
        res_js = requests.get(url=comm_url, headers=comment_headers, proxies=proxies).text
        comment = re.findall('"CommentCountStr":"(.*?)","CommentCount":', res_js)
        print(comment)
        # Bugfix: the original called the non-existent list method .appd().
        for g in zip(title, brand, serial, official, name, price, promotion, coupon, comment, picture):
            goo.append(g)
        print(f'第{i+1}件商品打印完成。')
    print(goo)
    save(goo)
# 5. Save the data, embedding the product images via xlsxwriter.
# 数据保存
def save(goo):
    """Write the scraped product tuples to 京东商品详情.xlsx.

    Column 9 of each data row gets the downloaded product image
    (hyperlinked to the original image URL); columns 0-8 get the text
    fields.  The worksheet is named after the module-level search
    keyword ``word``.
    """
    workbook = xlsxwriter.Workbook('京东商品详情.xlsx')
    worksheet = workbook.add_worksheet(word)
    # Cell style for the data rows.  (Renamed from `format`, which
    # shadowed the builtin.)  Other options — font size/colors, date
    # format — can be added here if needed.
    style = workbook.add_format({
        'bold': True,        # bold text
        'align': 'center',   # horizontal centering
        'valign': 'vcenter', # vertical centering
        'border': 1,         # border width
        'top': 1,            # top border
        'left': 1,           # left border
        'right': 1,          # right border
        'bottom': 1,         # bottom border
    })
    # Insert one image per product in column 9; images on disk are named
    # by their 0-based download order, rows start at 1 (row 0 = header).
    worksheet.set_column(9, 9, 350)  # column width
    for row, item in enumerate(goo, start=1):
        worksheet.set_row(row, 350)  # row height 350
        worksheet.insert_image(row, 9, f'D:\pythonproject\python项目爬虫\接单\京东商品评价获取(接单考核)\商品图片/{row - 1}.jpg', {'url': item[-1]})
    # Header row, then the text fields.  The last column's text cell is
    # intentionally skipped — it holds the embedded image instead.
    col = ('店铺名称', '品牌', '商品编号', '正式商品名称', '网页商品名称', '京东价', '促销', '优惠劵', '累计评价', '商品第一张主图片',)
    for c, heading in enumerate(col):
        worksheet.write(0, c, heading)
    for row, item in enumerate(goo, start=1):
        for c in range(len(col) - 1):
            worksheet.write(row, c, item[c], style)
    workbook.close()
# 6. Program entry point.  (Bugfix: the original heading line was bare
# prose, a syntax error.)
if __name__ == '__main__':
    get_link()
原文链接:https://blog.csdn.net/weixin_62871152/article/details/121457846