22
「docker实战篇」python的docker- 抖音视频抓取(中)(25)
source link: https://idig8.com/2019/04/13/dockershizhanpianpythondedocker-douyinshipinzhuaquzhong25/?amp%3Butm_medium=referral
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
本次主要用python对上次抖音分享页面中的_signature进行解析,并完成抖音视频的下载。源码:https://github.com/limingios/dockerpython.git (源码/「「docker实战篇」python的docker- 抖音视频抓取(下)(24))
https://github.com/limingios/dockerpython.git (谷歌插件)
找到方法,完成本地的html的生成
其实就是把分享页面中生成签名的函数复制出来,然后通过调用该函数的方式完成_signature的生成。
html_head.txt
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Title</title> </head> <body> </body> </html> <script type="text/javascript">
html_foot.txt
/*
 * html_foot.txt: script body that handle_douyin_movie.py appends after the
 * page-specific `var tac=...;` line when it assembles test.html.
 *
 * 1. The leading IIFE installs a small module registry on `this.__M`; it is
 *    skipped when `__M.require` already exists. `__M.define(id, factory)`
 *    stores a factory, and `__M.require(id)` runs the factory on first use
 *    and returns the cached `exports` afterwards. `async`, `resourceMap`,
 *    `loadJs`, `loadCss` and `alias` are attached to `require` as helpers,
 *    and `require.timeout` is set to 5e3.
 * 2. `__M.define("douyin_falcon:node_modules/byted-acrawler/dist/runtime", ...)`
 *    registers the obfuscated signing runtime copied from the share page.
 * 3. The trailing statements require that module, call `sign(&&&&)` on it and
 *    publish the result through `document.title` and `document.write`.
 *
 * `&&&&` is a placeholder, not valid JavaScript: handle_douyin_movie.py
 * substitutes the share ID for it before the page is loaded.
 *
 * NOTE(review): the obfuscated payload looks like it lost non-printable
 * characters during extraction (e.g. the character class in
 * `.replace(/[-]/g, ...)`, and the line breaks inside string literals) --
 * confirm against the copy in the upstream repository before running it.
 */
!function(t) { if (t.__M = t.__M || {}, !t.__M.require) { var e, n, r = document.getElementsByTagName("head")[0], i = {}, o = {}, a = {}, u = {}, c = {}, s = {}, l = function(t, n) { if (!(t in u)) { u[t] = !0; var i = document.createElement("script"); if (n) { var o = setTimeout(n, e.timeout); i.onerror = function() { clearTimeout(o), n() } ; var a = function() { clearTimeout(o) }; "onload"in i ? i.onload = a : i.onreadystatechange = function() { ("loaded" === this.readyState || "complete" === this.readyState) && a() } } return i.type = "text/javascript", i.src = t, r.appendChild(i), i } }, f = function(t, e, n) { var r = i[t] || (i[t] = []); r.push(e); var o, a = c[t] || c[t + ".js"] || {}, u = a.pkg; o = u ? s[u].url || s[u].uri : a.url || a.uri || t, l(o, n && function() { n(t) } ) }; n = function(t, e) { "function" != typeof e && (e = arguments[2]), t = t.replace(/\.js$/i, ""), o[t] = e; var n = i[t]; if (n) { for (var r = 0, a = n.length; a > r; r++) n[r](); delete i[t] } } , e = function(t) { if (t && t.splice) return e.async.apply(this, arguments); t = e.alias(t); var n = a[t]; if (n) return n.exports; var r = o[t]; if (!r) throw "[ModJS] Cannot find module `" + t + "`"; n = a[t] = { exports: {} }; var i = "function" == typeof r ? r.apply(n, [e, n.exports, n]) : r; return i && (n.exports = i), n.exports && !n.exports["default"] && Object.defineProperty && Object.isExtensible(n.exports) && Object.defineProperty(n.exports, "default", { value: n.exports }), n.exports } , e.async = function(n, r, i) { function a(t) { for (var n, r = 0, h = t.length; h > r; r++) { var p = e.alias(t[r]); p in o ? 
(n = c[p] || c[p + ".js"], n && "deps"in n && a(n.deps)) : p in s || (s[p] = !0, l++, f(p, u, i), n = c[p] || c[p + ".js"], n && "deps"in n && a(n.deps)) } } function u() { if (0 === l--) { for (var i = [], o = 0, a = n.length; a > o; o++) i[o] = e(n[o]); r && r.apply(t, i) } } "string" == typeof n && (n = [n]); var s = {} , l = 0; a(n), u() } , e.resourceMap = function(t) { var e, n; n = t.res; for (e in n) n.hasOwnProperty(e) && (c[e] = n[e]); n = t.pkg; for (e in n) n.hasOwnProperty(e) && (s[e] = n[e]) } , e.loadJs = function(t) { l(t) } , e.loadCss = function(t) { if (t.content) { var e = document.createElement("style"); e.type = "text/css", e.styleSheet ? e.styleSheet.cssText = t.content : e.innerHTML = t.content, r.appendChild(e) } else if (t.url) { var n = document.createElement("link"); n.href = t.url, n.rel = "stylesheet", n.type = "text/css", r.appendChild(n) } } , e.alias = function(t) { return t.replace(/\.js$/i, "") } , e.timeout = 5e3, t.__M.define = n, t.__M.require = e } }(this) __M.define("douyin_falcon:node_modules/byted-acrawler/dist/runtime", function(l, e) { Function(function(l) { return 'e(e,a,r){(b[e]||(b[e]=t("x,y","x "+e+" y")(r,a)}a(e,a,r){(k[r]||(k[r]=t("x,y","new x[y]("+Array(r+1).join(",x[y]")(1)+")")(e,a)}r(e,a,r){n,t,s={},b=s.d=r?r.d+1:0;for(s["$"+b]=s,t=0;t<b;t)s[n="$"+t]=r[n];for(t=0,b=s=a;t<b;t)s[t]=a[t];c(e,0,s)}c(t,b,k){u(e){v[x]=e}f{g=,ting(bg)}l{try{y=c(t,b,k)}catch(e){h=e,y=l}}for(h,y,d,g,v=[],x=0;;)switch(g=){case 1:u(!)4:f5:u((e){a=0,r=e;{c=a<r;c&&u(e[a]),c}}(6:y=,u((y8:if(g=,lg,g=,y===c)b+=g;else if(y!==l)y9:c10:u(s(11:y=,u(+y)12:for(y=f,d=[],g=0;g<y;g)d[g]=y.charCodeAt(g)^g+y;u(String.fromCharCode.apply(null,d13:y=,h=delete [y]14:59:u((g=)?(y=x,v.slice(x-=g,y:[])61:u([])62:g=,k[0]=65599*k[0]+k[1].charCodeAt(g)>>>065:h=,y=,[y]=h66:u(e(t[b],,67:y=,d=,u((g=).x===c?r(g.y,y,k):g.apply(d,y68:u(e((g=t[b])<"<"?(b--,f):g+g,,70:u(!1)71:n72:+f73:u(parseInt(f,3675:if(){bcase 
74:g=<<16>>16g76:u(k[])77:y=,u([y])78:g=,u(a(v,x-=g+1,g79:g=,u(k["$"+g])81:h=,[f]=h82:u([f])83:h=,k[]=h84:!085:void 086:u(v[x-1])88:h=,y=,h,y89:u({e{r(e.y,arguments,k)}e.y=f,e.x=c,e})90:null91:h93:h=0:;default:u((g<<16>>16)-16)}}n=this,t=n.Function,s=Object.keys||(e){a={},r=0;for(c in e)a[r]=c;a=r,a},b={},k={};r'.replace(/[-]/g, function(e) { return l[15 & e.charCodeAt(0)] }) }("v[x++]=v[--x]t.charCodeAt(b++)-32function return ))++.substrvar .length(),b+=;break;case ;break}".split("")))()('gr$Daten Иb/s!l y͒yĹg,(lfi~ah`{mv,-n|jqewVxp{rvmmx,&effkx[!cs"l".Pq%widthl"@q&heightl"vr*getContextx$"2d[!cs#l#,*;?|u.|uc{uq$fontl#vr(fillTextx$$龘ฑภ경2<[#c}l#2q*shadowBlurl#1q-shadowOffsetXl#$$limeq+shadowColorl#vr#arcx88802[%c}l#vr&strokex[ c}l"v,)}eOmyoZB]mx[ cs!0s$l$Pb<k7l l!r&lengthb%^l$1+s$jl s#i$1ek1s$gr#tack4)zgr#tac$! +0o![#cj?o ]!l$b%s"o ]!l"l$b*b^0d#>>>s!0s%yA0s"l"l!r&lengthb<k+l"^l"1+s"jl s&l&z0l!$ +["cs\'(0l#i\'1ps9wxb&s() &{s)/s(gr&Stringr,fromCharCodes)0s*yWl ._b&s o!])l l Jb<k$.aj;l .Tb<k$.gj/l .^b<k&i"-4j!+& s+yPo!]+s!l!l Hd>&l!l Bd>&+l!l <d>&+l!l 6d>&+l!l &+ s,y=o!o!]/q"13o!l q"10o!],l 2d>& s.{s-yMo!o!]0q"13o!]*Ld<l 4d#>>>b|s!o!l q"10o!],l!& s/yIo!o!].q"13o!],o!]*Jd<l 6d#>>>b|&o!]+l &+ s0l-l!&l-l!i\'1z141z4b/@d<l"b|&+l-l(l!b^&+l-l&zl\'g,)gk}ejo{cm,)|yn~Lij~em["cl$b%@d<l&zl\'l $ +["cl$b%b|&+l-l%8d<@b|l!b^&+ q$sign ', [Object.defineProperty(e, "__esModule", { value: !0 })]) }); dycs = __M.require("douyin_falcon:node_modules/byted-acrawler/dist/runtime") signc = dycs.sign(&&&&) document.title = signc document.write(signc) </script>
handle_douyin_movie.py 下载代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/2/20 17:39
# @Author  : Aries
# @Site    :
# @File    : handle_douyin_movie.py.py
# @Software: PyCharm
"""Download the videos listed on a Douyin user's share page.

Steps:
  1. Fetch the share page and extract the ``dytk`` token and the ``tac`` script.
  2. Assemble ``test.html`` from ``html_head.txt`` + ``tac`` + ``html_foot.txt``
     and load it in headless Chrome; the page puts the computed ``_signature``
     into ``document.title``.
  3. Call the post-list API with the signature (retrying, since the endpoint
     often returns an empty list) and save each video to its own file.
"""
import json
import os
import re
import time
from pathlib import Path

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Share ID of the user whose videos are downloaded.
share_id = "89923219116"
share_url = "https://www.douyin.com/share/user/" + share_id
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"
}

# Regular expressions that pull dytk and tac out of the share page.
dytk_search = re.compile(r"dytk: '(.*?)'")
tac_search = re.compile(r"<script>tac=(.*?)</script>")

CHROMEDRIVER_PATH = r"D:\Program Files\chromedriver\chromedriver.exe"
SIGNATURE_PAGE = Path("test.html")
MOVIE_API_URL = "https://www.douyin.com/aweme/v1/aweme/post/"

REQUEST_TIMEOUT_SECONDS = 10
MAX_ATTEMPTS = 100
RETRY_DELAY_SECONDS = 1


def _extract(pattern, text, label):
    """Return group 1 of ``pattern`` in ``text``; raise RuntimeError if absent."""
    match = pattern.search(text)
    if match is None:
        raise RuntimeError("could not find %s in the share page" % label)
    return match.group(1)


def fetch_share_tokens():
    """Fetch the share page and return ``(dytk, tac)`` extracted from it."""
    response = requests.get(
        url=share_url, headers=header, timeout=REQUEST_TIMEOUT_SECONDS
    )
    dytk = _extract(dytk_search, response.text, "dytk")
    tac = _extract(tac_search, response.text, "tac")
    return dytk, tac


def build_signature_page(tac):
    """Write test.html as head + ``var tac=...;`` + foot (with the share ID filled in)."""
    # The templates contain non-ASCII characters and the page declares
    # charset UTF-8, so do not rely on the platform default encoding.
    with open("html_head.txt", encoding="utf-8") as f1:
        head = f1.read()
    with open("html_foot.txt", encoding="utf-8") as f2:
        # "&&&&" is the placeholder for the argument of sign(); use the
        # configured share_id rather than a second hard-coded copy of it.
        foot = f2.read().replace("&&&&", share_id)
    tac_js = "var tac=" + tac + ";"
    with open(str(SIGNATURE_PAGE), "w", encoding="utf-8") as f_w:
        f_w.write(head + "\n" + tac_js + "\n" + foot)


def compute_signature():
    """Load test.html in headless Chrome and return the page title (the signature)."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    abspath = os.path.abspath(CHROMEDRIVER_PATH)
    douyin_driver = webdriver.Chrome(
        executable_path=abspath, chrome_options=chrome_options
    )
    try:
        # Open the file that was just written instead of a hard-coded
        # absolute path, so the script works from any working directory.
        douyin_driver.get(SIGNATURE_PAGE.resolve().as_uri())
        return douyin_driver.title
    finally:
        # Always shut the browser down, even when loading the page fails.
        douyin_driver.quit()


def fetch_aweme_list(signature, dytk):
    """Poll the post-list API until it returns a non-empty ``aweme_list``.

    Raises RuntimeError after MAX_ATTEMPTS empty or unparsable responses.
    """
    params = {
        "user_id": share_id,
        "count": "21",
        "max_cursor": "0",
        "aid": "1128",
        "_signature": signature,
        "dytk": dytk,
    }
    # The endpoint is unreliable and frequently answers with an empty list,
    # so retry -- but with a pause and an upper bound instead of spinning.
    for _ in range(MAX_ATTEMPTS):
        movie_response = requests.get(
            url=MOVIE_API_URL,
            params=params,
            headers=header,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        try:
            payload = json.loads(movie_response.text)
        except ValueError:
            payload = None
        aweme_list = payload.get("aweme_list") if isinstance(payload, dict) else None
        if aweme_list:
            print(movie_response.text)
            return aweme_list
        time.sleep(RETRY_DELAY_SECONDS)
    raise RuntimeError(
        "aweme_list was still empty after %d attempts" % MAX_ATTEMPTS
    )


def download_videos(aweme_list):
    """Save every video in ``aweme_list`` to ``douyin_<index>.mp4``."""
    for index, item in enumerate(aweme_list):
        video_url = item["video"]["play_addr"]["url_list"][0]
        video_response = requests.get(
            url=video_url, headers=header, timeout=REQUEST_TIMEOUT_SECONDS
        )
        video_response.raise_for_status()
        # One file per video; a single fixed name would keep only the last one.
        with open("douyin_%d.mp4" % index, "wb") as v:
            # Use .content (bytes), not .text, for binary video data.
            v.write(video_response.content)


def main():
    """Run the full share-page -> signature -> download pipeline."""
    dytk, tac = fetch_share_tokens()
    build_signature_page(tac)
    signature = compute_signature()
    download_videos(fetch_aweme_list(signature, dytk))


if __name__ == "__main__":
    main()
最终结果
关于chromedriver的配置,直接引入它的路径最稳,我比较喜欢这种方式;网上很多教程是通过配置环境变量来实现的,容易导致电脑变慢,不建议使用。
PS:抖音视频下载的功能基本上已经完成了,下次对需要注意的地方做个总结。
>>原创文章,欢迎转载。转载请注明:转载自IT人故事会,谢谢!
>>原文链接地址:上一篇:已是最新文章
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK