

Python Hands-On Case 03: A Focused Python Crawler for Comparing Taobao Product Prices
source link: https://blog.51cto.com/husheng/5914902

A note before we start: this tutorial is for technical study and exchange only; malicious use is prohibited.
1 Basic Steps
1.1 Fetching the Taobao search page
Simulate a browser so the page can be fetched without logging in. The headers and cookie below were copied from a real browser session; the cookie expires over time, so replace it with fresh values captured from your own browser's developer tools.
import requests

def getHTMLText(url):
    # Taobao's anti-crawling checks reject bare requests, so send the headers,
    # cookie, and params copied from a real browser session
    try:
        headers = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
        }
        params = (
            ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
        )
        r = requests.get(url, headers=headers, params=params)
        r.raise_for_status()              # raise on HTTP error status codes
        r.encoding = r.apparent_encoding  # guess the charset from the content
        return r.text
    except requests.RequestException:
        return ""
1.2 Formatting the scraped content
To make the output easier to inspect, extract each item's price and title into a list.
import re

def parsePage(ils, html):
    try:
        # The item data is embedded in the page as JSON, so regular
        # expressions can pull the fields out without an HTML parser
        p = re.findall(r'"view_price":"[\d.]*"', html)
        t = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(p)):
            # split(':', 1) splits only on the first colon, so titles that
            # contain a colon stay intact; eval strips the surrounding quotes
            price = eval(p[i].split(':', 1)[1])
            title = eval(t[i].split(':', 1)[1])
            ils.append([price, title])
    except Exception:
        print("")
1.3 Printing the results
def printGoodsList(ils):
    # chr(12288) is the full-width CJK space (U+3000); using it as the fill
    # character keeps columns of Chinese text aligned
    temp = "{0:^6}\t{1:^8}\t{2:{3}^16}"  # format template: index, price, title
    print(temp.format("序号", "价格", "商品名称", chr(12288)))  # header row: No., price, product title
    count = 0
    for l in ils:
        count = count + 1
        print(temp.format(count, l[0], l[1], chr(12288)))
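The third field's format spec {2:{3}^16} centers the title in a 16-character field whose fill character is passed as argument 3. Since U+3000 has the same display width as a Chinese character, it pads cleanly, whereas an ordinary ASCII space is half that width and leaves the column ragged. A quick comparison:

print('{0:^16}|'.format('机车皮衣'))                 # ASCII-space padding: columns drift
print('{0:{1}^16}|'.format('机车皮衣', chr(12288)))  # full-width padding: columns align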
1.4 Defining the main function
def main():
    while True:
        sth = input("请输入要查找的商品名称:")  # prompt: product keyword to search for
        pages = int(input("请输入要爬取的页面数:"))  # prompt: pages to fetch; keep this small to avoid triggering Taobao's rate limiting
        aurl = 'https://s.taobao.com/search?q=' + sth
        inlist = []
        for i in range(pages):  # fetch and parse one results page per iteration
            try:
                url = aurl + '&s=' + str(44 * i)  # each results page holds 44 items, so s is the item offset
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception:
                continue
        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')  # prompt: compare another product? y/n
        if a == 'y' or a == 'Y':
            continue
        else:
            break
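Taobao's search results carry 44 items per page and use the s query parameter as an item offset, which is why the loop requests s = 44 * i for page i. A short sketch of the URLs this generates for a hypothetical query:

aurl = 'https://s.taobao.com/search?q=机车皮衣'  # hypothetical query
for i in range(3):
    print(aurl + '&s=' + str(44 * i))
# https://s.taobao.com/search?q=机车皮衣&s=0
# https://s.taobao.com/search?q=机车皮衣&s=44
# https://s.taobao.com/search?q=机车皮衣&s=88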
2 Complete Code
import requests
import re

# re.findall matches the JSON embedded in the page directly, so there is no
# HTML parsing step and no need for BeautifulSoup
# Fetch the Taobao search results page
def getHTMLText(url):
    # Taobao's anti-crawling checks reject bare requests, so send the headers,
    # cookie, and params copied from a real browser session (the cookie
    # expires and must be refreshed from your own browser)
    try:
        headers = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
        }
        params = (
            ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
        )
        r = requests.get(url, headers=headers, params=params)
        r.raise_for_status()              # raise on HTTP error status codes
        r.encoding = r.apparent_encoding  # guess the charset from the content
        return r.text
    except requests.RequestException:
        return ""
# Parse the scraped page into [price, title] records
def parsePage(ils, html):
    try:
        # The item data is embedded in the page as JSON, so regular
        # expressions can pull the fields out without an HTML parser
        p = re.findall(r'"view_price":"[\d.]*"', html)
        t = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(p)):
            # split(':', 1) splits only on the first colon, so titles that
            # contain a colon stay intact; eval strips the surrounding quotes
            price = eval(p[i].split(':', 1)[1])
            title = eval(t[i].split(':', 1)[1])
            ils.append([price, title])
    except Exception:
        print("")
# Print the records as an aligned table
def printGoodsList(ils):
    # chr(12288) is the full-width CJK space (U+3000); using it as the fill
    # character keeps columns of Chinese text aligned
    temp = "{0:^6}\t{1:^8}\t{2:{3}^16}"  # format template: index, price, title
    print(temp.format("序号", "价格", "商品名称", chr(12288)))  # header row: No., price, product title
    count = 0
    for l in ils:
        count = count + 1
        print(temp.format(count, l[0], l[1], chr(12288)))
def main():
    while True:
        sth = input("请输入要查找的商品名称:")  # prompt: product keyword to search for
        pages = int(input("请输入要爬取的页面数:"))  # prompt: pages to fetch; keep this small to avoid triggering Taobao's rate limiting
        aurl = 'https://s.taobao.com/search?q=' + sth
        inlist = []
        for i in range(pages):  # fetch and parse one results page per iteration
            try:
                url = aurl + '&s=' + str(44 * i)  # each results page holds 44 items, so s is the item offset
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception:
                continue
        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')  # prompt: compare another product? y/n
        if a == 'y' or a == 'Y':
            continue
        else:
            break
main()
print('程序结束')  # "program finished"
input()  # keep the console window open when run as a packaged exe, without the CPU-burning busy loop
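The closing input() call only matters when the script is packaged as a Windows executable, where the console window would otherwise close the moment the program ends. If you want to build such an executable, PyInstaller's single-file mode is one common option; PyInstaller is an assumption here, not something the original post specifies, and the script name below is a placeholder:

pyinstaller --onefile taobao_compare.py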