7
宅男福利 用Python爬取美女图片
source link: https://blog.csdn.net/Miku_wx/article/details/112169093
Follow the source link to view the article with its images, the latest updates, and better typesetting. If the link is broken, please click the button below to view the snapshot taken at that time.
宅男福利 用Python爬取美女图片
嘿嘿 召唤老色批
今天带大家爬取一下美女的图片
用的是requests和xpath去解析
获取网页和解析网页的函数
def get_tag(response, tag):
    """Run the XPath expression *tag* against an HTML string and return the matches."""
    doc = etree.HTML(response)
    return doc.xpath(tag)
def parse_url(url):
    """Fetch *url* with the crawler's browser-spoofing headers and return the body text."""
    resp = requests.get(url, headers=headers)
    return resp.text
获取网页url
def url_find(url):
    """Collect every album link and title from one list page and download each album."""
    page_html = parse_url(url)
    links = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/@href')
    names = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/text()')
    for idx, link in enumerate(links):
        url_jpg_find(link, names[idx])
    print(names, '保存完毕')
获取图片的url
def url_jpg_find(url, title):
    """Download every page of the album at *url* into a folder called *title*."""
    global page
    page = 0  # per-album counter of saved images
    page_html = parse_url(url)
    # The fifth pagination anchor carries the album's total page count.
    total = int(get_tag(page_html, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    sub_pages = [url] + [url + '/' + str(k) for k in range(2, total + 1)]
    if not os.path.exists(title):
        os.makedirs(title)
    for sub in sub_pages:
        content_find(sub, title)
获取图片的信息
def content_find(url, title):
    """Pull the picture name and image source from one album page, then save the image."""
    page_html = parse_url(url)
    pic_name = get_tag(page_html, '/html/body/div[2]/div[1]/h2/text()')[0]
    pic_src = get_tag(page_html, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)  # small delay so we don't hammer the server
    save(pic_name, pic_src, title)
def save(name, url_jpg, title):
    """Download the image at *url_jpg* into <cwd>/<title>/<name>.jpg and count it.

    Increments the module-level ``page`` counter and prints the running total.
    """
    global page
    resp = requests.get(url_jpg, headers=headers)
    # Build the path with os.path.join instead of manual '/'-concatenation.
    # The with-statement closes the file on exit, so the original's explicit
    # close() inside the block was redundant and has been removed.
    with open(os.path.join(os.getcwd(), title, name + '.jpg'), 'wb') as out:
        out.write(resp.content)
    page += 1
    print(page)
# Third-party (requests, lxml) and standard-library imports used by the crawler.
import requests,os,time
from lxml import etree
# Spoof a desktop Chrome user agent and send a Referer so the image host
# serves the pictures instead of rejecting hot-link requests.
headers={
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
"Referer" : "https://www.mzitu.com",
}
# Global counter of images saved for the current album (reset in url_jpg_find).
page=0
def get_tag(response, tag):
    """Parse an HTML document string and return the nodes matching the XPath *tag*."""
    tree = etree.HTML(response)
    return tree.xpath(tag)
def parse_url(url):
    """GET *url* with the spoofed browser headers and return the response body as text."""
    return requests.get(url, headers=headers).text
def url_find(url):
    """Scan one list page: gather album links and titles, then download each album."""
    html = parse_url(url)
    album_urls = get_tag(html, '//*[@id="pins"]/li/span[1]/a/@href')
    titles = get_tag(html, '//*[@id="pins"]/li/span[1]/a/text()')
    for album_url, album_title in zip(album_urls, titles):
        url_jpg_find(album_url, album_title)
    print(titles, '保存完毕')
def url_jpg_find(url, title):
    """Walk every page of one album and save each picture into a folder named *title*."""
    global page
    page = 0  # reset the saved-picture counter for this album
    html = parse_url(url)
    # The last pagination entry on the first page holds the album's page count.
    last = int(get_tag(html, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    pages = [url]
    for n in range(2, last + 1):
        pages.append(url + '/' + str(n))
    if not os.path.exists(title):
        os.makedirs(title)
    for page_url in pages:
        content_find(page_url, title)
def content_find(url, title):
    """Extract the picture name and image URL from one album page, then save it."""
    html = parse_url(url)
    name = get_tag(html, '/html/body/div[2]/div[1]/h2/text()')[0]
    img_src = get_tag(html, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)  # throttle requests so the site is less likely to block us
    save(name, img_src, title)
def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Bumps the module-level ``page`` counter and prints the running total.
    """
    global page
    r = requests.get(url_jpg, headers=headers)
    # os.path.join replaces manual '/'-string concatenation; the redundant
    # close() inside the with-block is removed — the context manager already
    # closes the file when the block exits.
    with open(os.path.join(os.getcwd(), title, name + '.jpg'), 'wb') as fh:
        fh.write(r.content)
    page += 1
    print(page)
def main():
    """Discover how many list pages the site has, then crawl every one of them."""
    start_url = 'https://www.mzitu.com'
    html = parse_url(start_url)
    # The fourth anchor in the site-wide pager shows the last list-page number.
    last = int(get_tag(html, '/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    base = 'https://www.mzitu.com/page/'
    list_pages = ['https://www.mzitu.com'] + [base + str(n) for n in range(2, last + 1)]
    for list_url in list_pages:
        url_find(list_url)


if __name__ == '__main__':
    main()
效果图就不放了
咳咳 太诱人 会被封掉
请大家自行脑补一下
一起学习python,小白指导,教学分享记得私信我
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK