10

Python多线程下载黑白网学习资源库文件

 3 years ago
source link: https://www.heibai.org/post/1991.html
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
live.png

黑白网(heibai.org)成立于2014年,多年来以其专业的视角,优质的服务为广大安全技术爱好者提供了目前国内最全的网络安全技术学习资料,普及中国网络安全知识,宣扬正确的黑客极客文化,全方面提高国内安全技术水平。

前几天(周),发现个信息安全的资源网站黑白网,看到一堆关于信息安全的资料。啊这,像我这种看到资料就想收藏的人哪里忍得了,奈何资料有点多,就写个Python3脚本挂着下载。

主要是官网显示居然要今年取消一切服务,这还不下载保存??

原始脚本【无多线程】

Python
# Single-threaded downloader: fetch the index page of edu.heibai.org, record
# every listed file in heibai.txt, then download each file that is not yet
# present in the local download directory.
import os
import re
import time
from pathlib import Path

import requests

url = 'https://edu.heibai.org/?c=cache'  # index page listing every file
url2 = 'https://edu.heibai.org/'  # base URL the listed paths are relative to
path = 'E:\\信息安全\\\\test\\'  # download directory

# Seconds to wait for the server before giving up on a request. Without a
# timeout a stalled connection blocks the script forever.
TIMEOUT = 30


def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Returns True when the directory was created and False when it was
    already there.
    """
    path = path.strip()
    if os.path.exists(path):
        return False
    # exist_ok guards against the directory appearing between the check
    # above and this call.
    os.makedirs(path, exist_ok=True)
    return True


def save_listing():
    """Fetch the index page and write one relative file path per line to heibai.txt.

    Raises requests.RequestException when the index page cannot be fetched.
    """
    req = requests.get(url, timeout=TIMEOUT)
    req.raise_for_status()
    names = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w', encoding='utf8') as f:
        f.writelines(name + '\n' for name in names)


def download_all():
    """Download every file named in heibai.txt that is missing locally."""
    with open('heibai.txt', 'r', encoding='utf8') as f:
        for line in f:
            # Strip only the newline; slicing off the last character would
            # damage a final line that has no trailing newline.
            name = line.rstrip('\n')
            if not name:
                continue
            my_file = Path(path + name)
            if my_file.exists():
                continue
            # Always ensure the parent exists, including the download
            # directory itself for files that sit directly in it.
            mkdir(str(my_file.parent))
            print('Downloading:' + name + ' ' + time.asctime())
            try:
                r = requests.get(url2 + name, timeout=TIMEOUT)
                # Do not store an HTTP error page under the file's name.
                r.raise_for_status()
            except requests.RequestException as exc:
                # Skip this file; it is still missing locally, so the next
                # run will try it again.
                print('Failed:' + name + ' ' + str(exc))
                continue
            with open(my_file, "wb") as code:
                code.write(r.content)
    print("Finish!")


if __name__ == '__main__':
    save_listing()
    download_all()

挂着下载到本地,后来发现是真的慢,就考虑多线程下载??

多线程脚本

本来没学过多线程,临时抱佛脚学了一会原理,然后就直接搬网上的脚本改了一下

Python
# -*- coding: utf-8 -*-
# Multi-threaded downloader: fetch the index page of edu.heibai.org, record
# every listed file in heibai.txt, then download each missing file by
# splitting it into byte ranges that are fetched in parallel.
import os
import re
import sys
import threading
import time
from pathlib import Path

import requests

url = 'https://edu.heibai.org/?c=cache'  # index page listing every file
url2 = 'https://edu.heibai.org/'  # base URL the listed paths are relative to
path = 'E:\\信息安全\\安全书及笔记\\heibai\\'  # download directory

# Seconds to wait for the server before giving up on a request. Without a
# timeout a stalled connection blocks the script forever.
TIMEOUT = 30

# Listed files that are never downloaded.
Blacklist = ['思维导图/移动安全/.DS_Store',]


def txt():
    """Fetch the index page and write one relative file path per line to heibai.txt.

    Raises requests.RequestException when the index page cannot be fetched.
    """
    req = requests.get(url, timeout=TIMEOUT)
    req.raise_for_status()
    names = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w', encoding='utf8') as f:
        f.writelines(name + '\n' for name in names)


def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Returns True when the directory was created and False when it was
    already there.
    """
    path = path.strip()
    if os.path.exists(path):
        return False
    # exist_ok guards against the directory appearing between the check
    # above and this call.
    os.makedirs(path, exist_ok=True)
    return True


class MulThreadDownload(threading.Thread):
    """Thread that downloads one inclusive byte range of a URL into a file.

    url      -- address of the file to download
    startpos -- first byte of the range (also the offset written to)
    endpos   -- last byte of the range, inclusive
    f        -- binary file object opened for random-access writing; it must
                not share its file position with another thread's handle

    After the thread has finished, ``error`` holds the exception that made
    the download fail, or None on success.
    """

    def __init__(self, url, startpos, endpos, f):
        super().__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.fd = f
        self.error = None

    def download(self):
        """Fetch the byte range and write it at its offset in the file."""
        headers = {"Range": "bytes=%s-%s" % (self.startpos, self.endpos)}
        res = requests.get(self.url, headers=headers, timeout=TIMEOUT)
        res.raise_for_status()
        # Anything other than 206 means the body is not the requested range
        # (e.g. the whole file); writing it at startpos would corrupt the file.
        if res.status_code != 206:
            raise IOError('server ignored the Range header (HTTP %s)'
                          % res.status_code)
        self.fd.seek(self.startpos)
        self.fd.write(res.content)
        self.fd.flush()

    def run(self):
        # An exception raised inside a thread never reaches the code that
        # joins it, so keep it for the caller to inspect.
        try:
            self.download()
        except Exception as exc:
            self.error = exc


def split_ranges(filesize, threadnum):
    """Return at most *threadnum* inclusive (start, end) ranges covering the file.

    An 11-byte file split in two gives [(0, 5), (6, 10)]. An empty file
    gives no ranges.
    """
    if filesize <= 0:
        return []
    # Ceiling division keeps the number of ranges within threadnum, and a
    # step of at least 1 guarantees progress for files smaller than threadnum.
    step = max(1, -(-filesize // threadnum))
    return [(start, min(start + step, filesize) - 1)
            for start in range(0, filesize, step)]


def remote_size(file_url):
    """Return the Content-Length of *file_url*, or None if the server answers 404.

    Any other failure is retried every 10 minutes without limit, so the
    script can be left running unattended.
    """
    while True:
        try:
            res = requests.head(file_url, timeout=TIMEOUT)
            if res.status_code == 404:
                return None
            res.raise_for_status()
            return int(res.headers['Content-Length'])
        except (requests.RequestException, KeyError, ValueError):
            print('10分钟后重试连接服务器!')
            time.sleep(60*10)


def download_file(file_url, target, filesize, threadnum=2):
    """Download *file_url* into *target* using up to *threadnum* threads.

    Data is written to "<target>.part" and renamed only when every range
    succeeded, so an interrupted download is not mistaken for a finished
    file on the next run. Returns True on success, False otherwise.
    """
    part = Path(str(target) + '.part')
    # Create (or empty) the temporary file so it can be reopened in rb+ mode.
    with open(part, 'wb'):
        pass

    workers = []
    handles = []
    try:
        for start, end in split_ranges(filesize, threadnum):
            # Each thread gets its own open() handle: duplicated descriptors
            # share a single file offset, which makes concurrent seek+write
            # pairs overwrite each other.
            fd = open(part, 'rb+')
            handles.append(fd)
            t = MulThreadDownload(file_url, start, end, fd)
            t.start()
            workers.append(t)
    finally:
        for t in workers:
            t.join()
        for fd in handles:
            fd.close()

    errors = [t.error for t in workers if t.error is not None]
    if errors:
        os.remove(part)
        print('Failed:' + str(target) + ' ' + str(errors[0]))
        return False
    os.replace(part, target)
    return True


def main():
    """Refresh heibai.txt and download every listed file that is missing locally."""
    txt()
    # Number of threads used per file.
    threadnum = 2
    with open('heibai.txt', 'r', encoding='utf8') as listing:
        for line in listing:
            # Strip only the newline; slicing off the last character would
            # damage a final line that has no trailing newline.
            filename = line.rstrip('\n')
            if not filename or filename in Blacklist:
                continue
            my_file = Path(path + filename)
            if my_file.exists():
                continue
            # Always ensure the parent exists, including the download
            # directory itself for files that sit directly in it.
            mkdir(str(my_file.parent))
            filesize = remote_size(url2 + filename)
            if filesize is None:
                print('Not found, skipped:' + filename)
                continue
            print('Downloading:' + filename + ' ' + time.asctime())
            download_file(url2 + filename, my_file, filesize, threadnum)


if __name__ == '__main__':
    main()

这是闲得慌、嫌硬盘太大时写的脚本,写得有点乱
思维导图/移动安全/.DS_Store这文件被网站拦截,不能下载
似乎多线程太快了,服务器把我ip给ban了一会,所以是否使用多线程脚本看自己的网速
后来多线程中加入了延时重试连接服务器,应该可以晚上挂着下载了

我只在脚本的多线程下载部分加入了重连,因此如果脚本运行前 IP 就已经被 ban,脚本会直接报错

原始脚本下载一段时间,服务器会超时,脚本会卡住
再后来,直接在文件下载之间睡眠5s,好像效果还行

Python
# Excerpt from the body of the per-file loop of the multi-threaded script:
# the same size lookup as before, with a short pause added before each file.
if not my_file.exists():
    if '/' in filename:
        new = re.findall(r"(.*)/", filename)
        mkdir(path+new[0])
    # Pause between files to lower the request rate.
    print('防止被ban,暂停5s中...')
    time.sleep(5)
    while 1:
        try:
            # The timeout keeps a stalled connection from blocking forever;
            # a timed-out request falls into the retry branch below.
            filesize = int(requests.head(url2+filename, timeout=30).headers['Content-Length'])
        except (requests.RequestException, KeyError, ValueError):
            # Network failure, or a missing/invalid Content-Length header:
            # wait 10 minutes, then ask again.
            print('10分钟后重试连接服务器!')
            time.sleep(60*10)
            continue
        break
    print('Downloading:'+filename+' '+time.asctime(time.localtime(time.time())))

   原文链接                                                    
http://www.dtmao.cc/news_show_755626.shtml


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK