39

python实操案例__03--python定向爬虫之淘宝商品比价

 2 years ago
source link: https://blog.51cto.com/husheng/5914902
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
neoserver,ios ssh client

写在前面,本教程仅为技术学习与交流使用,禁止恶意使用。

1 基本步骤

1.1 对淘宝网页进行提取

模拟浏览器免登录进入


def getHTMLText(url, timeout=10):
    """Download a search page and return its decoded text.

    Sends a GET request with browser-like headers and a fixed ``spm`` query
    parameter, then decodes the body using the encoding detected from its
    content.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the program.

    Returns:
        The response body as text, or "" when the request fails or the
        server answers with an HTTP error status.
    """
    # Taobao rejects obvious crawlers, so the request imitates a browser by
    # sending the headers and params below.
    # NOTE(review): the cookie is a hard-coded value that looks like it was
    # copied from one browser session; it is presumably account-specific and
    # will expire. Replace it with your own and keep real cookies out of
    # shared code.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
    }

    params = (
        ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
    )
    try:
        r = requests.get(url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Network failures, timeouts and HTTP error statuses all map to the
        # "no content" result; Ctrl-C and programming errors still propagate.
        return ""
    r.encoding = r.apparent_encoding
    return r.text

1.2 将爬取内容格式化

为方便输出查看,将爬取内容格式化

def parsePage(ils, html):
    """Extract price/title pairs from a search page and append them to *ils*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if the two counts differ, the surplus entries are ignored.

    Args:
        ils: List that receives one ``[price, title]`` entry per product.
            Both values are strings.
        html: Page text to scan. Anything that is not a string yields no
            entries.
    """
    import json  # local import: only needed here to decode string escapes

    if not isinstance(html, str):
        return

    # Capture groups pull the values out directly, so titles that contain a
    # colon are no longer cut short by a split(':').
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern skips over backslash-escaped characters so an
    # escaped quote inside a title does not end the match early.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        # Scraped text is untrusted, so escapes such as \uXXXX are decoded
        # with the JSON parser instead of eval().
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            title = raw_title  # not a valid JSON string: keep the raw text
        ils.append([price, title])
 

1.3 输出查看

def printGoodsList(ils):
    """Print the goods in *ils* as a tab-separated, centred three-column table.

    Args:
        ils: Sequence of entries whose first item is the price and whose
            second item is the title. Rows are numbered starting at 1.
    """
    fill = chr(12288)  # U+3000, used as the padding character for the title column
    row_format = "{0:^6}\t{1:^8}\t{2:{3}^16}"

    # Header row first, then one numbered row per entry.
    print(row_format.format("序号", "价格", "商品名称", fill))
    for number, goods in enumerate(ils, start=1):
        print(row_format.format(number, goods[0], goods[1], fill))

1.4 定义主函数调用

def main():
    """Run the interactive price-comparison loop.

    Each round asks for a product name and a page count, fetches that many
    search pages, prints the collected goods, and then asks whether to run
    another round. Any answer other than ``y``/``Y`` ends the loop.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    while True:
        sth = input("请输入要查找的商品名称:")
        try:
            # Keep the page count small; the original author advises against
            # crawling many pages.
            pages = int(input("请输入要爬取的页面数:"))
        except ValueError:
            # A non-numeric page count restarts the round instead of crashing.
            print("页面数必须为整数,请重新输入")
            continue
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        aurl = 'https://s.taobao.com/search?q=' + quote(sth, safe='')
        inlist = []
        for i in range(pages):  # fetch and parse one page at a time
            try:
                # The 's' offset grows by 44 per page (presumably the number
                # of results on one page).
                url = aurl + '&s=' + str(44 * i)
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception:
                # Best effort: one failing page must not abort the run.
                # Ctrl-C is no longer swallowed here.
                continue
        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')
        if a.lower() != 'y':
            break

2 完整代码

import requests
import re
 
 
# 由于直接用re库findall函数直接匹配,所以直接跳过网页解析,故不用BeautifulSoup库
# 淘宝网页提取
def getHTMLText(url, timeout=10):
    """Download a search page and return its decoded text.

    Sends a GET request with browser-like headers and a fixed ``spm`` query
    parameter, then decodes the body using the encoding detected from its
    content.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the program.

    Returns:
        The response body as text, or "" when the request fails or the
        server answers with an HTTP error status.
    """
    # Taobao rejects obvious crawlers, so the request imitates a browser by
    # sending the headers and params below.
    # NOTE(review): the cookie is a hard-coded value that looks like it was
    # copied from one browser session; it is presumably account-specific and
    # will expire. Replace it with your own and keep real cookies out of
    # shared code.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
    }

    params = (
        ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
    )
    try:
        r = requests.get(url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Network failures, timeouts and HTTP error statuses all map to the
        # "no content" result; Ctrl-C and programming errors still propagate.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
 
# 将爬取内容格式化
def parsePage(ils, html):
    """Extract price/title pairs from a search page and append them to *ils*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if the two counts differ, the surplus entries are ignored.

    Args:
        ils: List that receives one ``[price, title]`` entry per product.
            Both values are strings.
        html: Page text to scan. Anything that is not a string yields no
            entries.
    """
    import json  # local import: only needed here to decode string escapes

    if not isinstance(html, str):
        return

    # Capture groups pull the values out directly, so titles that contain a
    # colon are no longer cut short by a split(':').
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern skips over backslash-escaped characters so an
    # escaped quote inside a title does not end the match early.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        # Scraped text is untrusted, so escapes such as \uXXXX are decoded
        # with the JSON parser instead of eval().
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            title = raw_title  # not a valid JSON string: keep the raw text
        ils.append([price, title])
 
 
# 爬取内容格式化输出
def printGoodsList(ils):
    """Print the goods in *ils* as a tab-separated, centred three-column table.

    Args:
        ils: Sequence of entries whose first item is the price and whose
            second item is the title. Rows are numbered starting at 1.
    """
    fill = chr(12288)  # U+3000, used as the padding character for the title column
    row_format = "{0:^6}\t{1:^8}\t{2:{3}^16}"

    # Header row first, then one numbered row per entry.
    print(row_format.format("序号", "价格", "商品名称", fill))
    for number, goods in enumerate(ils, start=1):
        print(row_format.format(number, goods[0], goods[1], fill))
 
 
def main():
    """Run the interactive price-comparison loop.

    Each round asks for a product name and a page count, fetches that many
    search pages, prints the collected goods, and then asks whether to run
    another round. Any answer other than ``y``/``Y`` ends the loop.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    while True:
        sth = input("请输入要查找的商品名称:")
        try:
            # Keep the page count small; the original author advises against
            # crawling many pages.
            pages = int(input("请输入要爬取的页面数:"))
        except ValueError:
            # A non-numeric page count restarts the round instead of crashing.
            print("页面数必须为整数,请重新输入")
            continue
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        aurl = 'https://s.taobao.com/search?q=' + quote(sth, safe='')
        inlist = []
        for i in range(pages):  # fetch and parse one page at a time
            try:
                # The 's' offset grows by 44 per page (presumably the number
                # of results on one page).
                url = aurl + '&s=' + str(44 * i)
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception:
                # Best effort: one failing page must not abort the run.
                # Ctrl-C is no longer swallowed here.
                continue
        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')
        if a.lower() != 'y':
            break
 
 
if __name__ == "__main__":
    main()
    print('程序结束')
    # Keep the console window of a packaged .exe open until the user presses
    # Enter. Blocking on input() replaces the former `while True: pass`
    # busy loop, which kept a CPU core spinning.
    try:
        input()
    except EOFError:
        pass  # no interactive stdin available: just exit

Recommend

About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK