7

宅男福利 用Python爬取美女图片

 3 years ago
source link: https://blog.csdn.net/Miku_wx/article/details/112169093
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

宅男福利 用Python爬取美女图片

嘿嘿 召唤老色批
今天带大家爬取一下美女的图片

用的是requests和xpath去解析

获取网页和解析网页的函数

def get_tag(response, tag):
    """Parse an HTML string and return the list of XPath matches for *tag*."""
    tree = etree.HTML(response)
    return tree.xpath(tag)

def parse_url(url):
    """GET *url* with the spoofed browser headers and return the response body as text."""
    return requests.get(url, headers=headers).text

获取网页url

def url_find(url):
    """Fetch one gallery-list page and download every gallery linked on it.

    url: a list page such as https://www.mzitu.com/page/2
    """
    r = parse_url(url)
    # Each <li> under #pins holds one gallery: the <a> href is the gallery
    # URL and its text is the gallery title (used as the folder name).
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    titles = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    for gallery_url, title in zip(url_list, titles):
        url_jpg_find(gallery_url, title)
        # Bug fix: print this gallery's title, not the whole title list.
        print(title, '保存完毕')

获取图片的url

def url_jpg_find(url, title):
    """Walk every page of one gallery and download each image.

    url:   the gallery's first-page URL
    title: gallery title, used as the destination directory name
    """
    global page
    page = 0  # reset the per-gallery image counter incremented by save()
    r = parse_url(url)
    # The pagination bar's 5th <a> holds the last page number.
    # NOTE(review): assumes the bar always has >= 5 links — confirm on site.
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # Page 1 is the gallery URL itself; pages 2..N append "/<n>".
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    # exist_ok=True replaces the race-prone exists()+makedirs() pair.
    os.makedirs(title, exist_ok=True)
    for page_url in url_list:
        content_find(page_url, title)

获取图片的信息

def content_find(url, title):
    """Extract the image name and URL from one gallery page and save the image."""
    html = parse_url(url)
    # The page heading doubles as the image file name.
    name = get_tag(html, '/html/body/div[2]/div[1]/h2/text()')[0]
    # The main-image block wraps the photo in an <a><img> pair.
    img_src = get_tag(html, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)  # brief pause between requests to be gentle on the server
    save(name, img_src, title)
def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments the module-global *page* counter and prints it.
    """
    global page
    r = requests.get(url_jpg, headers=headers)
    # os.path.join replaces manual '/' concatenation; the with-block already
    # closes the file, so the old explicit close() after it was redundant.
    with open(os.path.join(os.getcwd(), title, name + '.jpg'), 'wb') as fh:
        fh.write(r.content)
    page += 1
    print(page)

import requests,os,time
from lxml import etree

# Spoofed browser headers. NOTE(review): the Referer appears to be needed so
# the image host serves hot-linked images — confirm against the site.
headers={
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Referer" : "https://www.mzitu.com",
}

# Global per-gallery image counter (reset in url_jpg_find, bumped in save).
page=0

def get_tag(response, tag):
    """Parse an HTML string and return the list of XPath matches for *tag*."""
    tree = etree.HTML(response)
    return tree.xpath(tag)

def parse_url(url):
    """GET *url* with the spoofed browser headers and return the response body as text."""
    return requests.get(url, headers=headers).text

def url_find(url):
    """Fetch one gallery-list page and download every gallery linked on it.

    url: a list page such as https://www.mzitu.com/page/2
    """
    r = parse_url(url)
    # Each <li> under #pins holds one gallery: the <a> href is the gallery
    # URL and its text is the gallery title (used as the folder name).
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    titles = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    for gallery_url, title in zip(url_list, titles):
        url_jpg_find(gallery_url, title)
        # Bug fix: print this gallery's title, not the whole title list.
        print(title, '保存完毕')

def url_jpg_find(url, title):
    """Walk every page of one gallery and download each image.

    url:   the gallery's first-page URL
    title: gallery title, used as the destination directory name
    """
    global page
    page = 0  # reset the per-gallery image counter incremented by save()
    r = parse_url(url)
    # The pagination bar's 5th <a> holds the last page number.
    # NOTE(review): assumes the bar always has >= 5 links — confirm on site.
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # Page 1 is the gallery URL itself; pages 2..N append "/<n>".
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    # exist_ok=True replaces the race-prone exists()+makedirs() pair.
    os.makedirs(title, exist_ok=True)
    for page_url in url_list:
        content_find(page_url, title)

def content_find(url, title):
    """Extract the image name and URL from one gallery page and save the image."""
    html = parse_url(url)
    # The page heading doubles as the image file name.
    name = get_tag(html, '/html/body/div[2]/div[1]/h2/text()')[0]
    # The main-image block wraps the photo in an <a><img> pair.
    img_src = get_tag(html, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)  # brief pause between requests to be gentle on the server
    save(name, img_src, title)

def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments the module-global *page* counter and prints it.
    """
    global page
    r = requests.get(url_jpg, headers=headers)
    # os.path.join replaces manual '/' concatenation; the with-block already
    # closes the file, so the old explicit close() after it was redundant.
    with open(os.path.join(os.getcwd(), title, name + '.jpg'), 'wb') as fh:
        fh.write(r.content)
    page += 1
    print(page)

def main():
    """Entry point: discover how many list pages exist, then crawl them all."""
    start_url = 'https://www.mzitu.com'
    html = parse_url(start_url)
    # The 4th pagination link on the front page shows the last page number.
    last_page = int(get_tag(html, '/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    page_urls = ['https://www.mzitu.com']
    page_urls += ['https://www.mzitu.com/page/' + str(n) for n in range(2, last_page + 1)]
    for page_url in page_urls:
        url_find(page_url)


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

效果图就不放了
咳咳 太诱人 会被封掉
请大家自行脑补一下

一起学习python,小白指导,教学分享记得私信我


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK