

基于golang实现代理IP抓取实例
source link: https://studygolang.com/articles/32399
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

"github.com/imroc/req" "github.com/PuerkitoBio/goquery"
以上两个第三方库是本例的依赖。实例比较简单,直接上代码:
/*
Proxy IP collection (scraping) implemented in Go.
*/
package main
import (
"fmt"
"math/rand"
"time"
"strings"
"regexp"
"database/sql"
"github.com/imroc/req"
"github.com/PuerkitoBio/goquery"
)
// USER_AGENTS is the global pool of User-Agent header values. GetData picks
// one entry at random for every request it sends.
var USER_AGENTS []string = []string{"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
}
//get请求
func GetData(url string) (string,int) {
rand.Seed(time.Now().UnixNano())
i := rand.Intn(34)
//req.SetProxyUrl("http://41.169.158.90:8080")//设置代理
req.SetTimeout(20 * time.Second)//设置超时
r, err := req.Get(url,req.Header{"User-Agent": USER_AGENTS[i],
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Accept-Encoding": "gzip, deflate",
})
if err != nil {
fmt.Println("请求错误 : ",err)
return fmt.Sprintf("请求错误 : %s",err),0
}
resp := r.Response()
return r.String(),resp.StatusCode
}
// GetNimadailiIP scrapes the proxy table on http://www.nimadaili.com and
// prints the collected proxies as "<scheme>://<address>" strings.
//
// The text of every table row is split on newlines; only rows that produce
// exactly 9 pieces are used, with piece 1 taken as the address and piece 3 as
// a comma-separated list of protocol names. At most the first two protocols
// of a row are emitted.
func GetNimadailiIP() {
	const pageURL = "http://www.nimadaili.com"

	body, status := GetData(pageURL)
	fmt.Println(status)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
	}

	var proxies []string
	doc.Find("#overflow table tbody tr").Each(func(_ int, row *goquery.Selection) {
		fields := strings.Split(row.Text(), "\n")
		if len(fields) != 9 {
			return
		}

		addr := strings.ReplaceAll(fields[1], " ", "")
		schemes := strings.Split(strings.ToLower(strings.ReplaceAll(fields[3], " ", "")), ",")
		// Only the first two listed protocols are ever used.
		if len(schemes) > 2 {
			schemes = schemes[:2]
		}
		for _, scheme := range schemes {
			proxies = append(proxies, fmt.Sprintf("%s://%s", scheme, addr))
		}
	})

	fmt.Println(proxies)
}
// GetCrossincodeIP scrapes the proxy table on
// https://lab.crossincode.com/proxy/ and prints the collected proxies.
//
// The text of every table row is split on newlines; only rows that produce
// exactly 9 pieces are used, with piece 1 as the IP and piece 2 as the port.
// The header row (IP column equal to "Addr") is skipped. Each remaining row
// yields both an "http://<ip>:<port>" and an "https://<ip>:<port>" entry.
func GetCrossincodeIP() {
	body, status := GetData("https://lab.crossincode.com/proxy/")
	fmt.Println(status)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
	}

	var proxies []string
	doc.Find("table tr").Each(func(_ int, row *goquery.Selection) {
		fields := strings.Split(row.Text(), "\n")
		if len(fields) != 9 {
			return
		}

		host := strings.ReplaceAll(fields[1], " ", "")
		if host == "Addr" {
			return // table header, not a proxy
		}
		port := strings.ReplaceAll(fields[2], " ", "")

		proxies = append(proxies,
			fmt.Sprintf("http://%s:%s", host, port),
			fmt.Sprintf("https://%s:%s", host, port),
		)
	})

	fmt.Println(proxies)
}
// GetXiladailiIP scrapes the proxy table on http://www.xiladaili.com/ and
// prints the collected proxies as "<scheme>://<address>" strings.
//
// The text of every row under #scroll is split on newlines; only rows that
// produce exactly 10 pieces are used, with piece 1 as the address and piece 3
// as a comma-separated list of protocol names. The header row (address column
// equal to "免费ip代理") is skipped. One entry is emitted per listed protocol.
func GetXiladailiIP() {
	body, status := GetData("http://www.xiladaili.com/")
	fmt.Println(status)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
	}

	var proxies []string
	doc.Find("#scroll tr").Each(func(_ int, row *goquery.Selection) {
		fields := strings.Split(row.Text(), "\n")
		if len(fields) != 10 {
			return
		}

		addr := strings.ReplaceAll(fields[1], " ", "")
		if addr == "免费ip代理" {
			return // table header, not a proxy
		}

		schemes := strings.ToLower(strings.ReplaceAll(fields[3], " ", ""))
		for _, scheme := range strings.Split(schemes, ",") {
			proxies = append(proxies, fmt.Sprintf("%s://%s", scheme, addr))
		}
	})

	fmt.Println(proxies)
}
// jiangxianliCellRe captures the inner content of attribute-less <td> cells.
// It is compiled once instead of once per table row.
var jiangxianliCellRe = regexp.MustCompile(`<td>(.*?)</td>`)

// GetJiangxianliIP scrapes the proxy listing on ip.jiangxianli.com
// (http://ip.jiangxianli.com/?page=<n>) and prints the collected proxies as
// "<scheme>://<ip>:<port>" strings.
//
// Pages 0 through 4 are requested. NOTE(review): the original note on this
// function says the first three pages (1~3) are enough because the site keeps
// refreshing; the loop bounds were left as they were — confirm the intended
// range.
//
// For every "tbody tr" row the plain <td> cells are extracted from the row
// HTML: cell 0 is the IP, cell 1 the port and cell 3 the protocol. Rows that
// do not contain at least four such cells are skipped rather than indexed
// blindly, which previously caused an index-out-of-range panic.
func GetJiangxianliIP() {
	var proxies []string

	for page := 0; page < 5; page++ {
		pageURL := fmt.Sprintf("http://ip.jiangxianli.com/?page=%d", page)
		body, status := GetData(pageURL)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
			continue // no document to query for this page
		}

		doc.Find("tbody tr").Each(func(_ int, row *goquery.Selection) {
			rowHTML, err := row.Html()
			if err != nil {
				return
			}

			cells := jiangxianliCellRe.FindAllStringSubmatch(rowHTML, -1)
			if len(cells) < 4 {
				return // not a data row (or the markup changed)
			}

			ip := cells[0][1]
			port := cells[1][1]
			scheme := strings.ToLower(cells[3][1])
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, ip, port))
		})
	}

	fmt.Println(proxies)
}
// GetSuperfastipIP scrapes the free-proxy listing pages of superfastip.com
// (http://www.superfastip.com/welcome/freeip/<n>) and prints the collected
// proxies as "<scheme>://<ip>:<port>" strings.
//
// Pages 0 through 10 are requested in order. The text of every "tbody tr" row
// is split on newlines; only rows that produce exactly 10 pieces are used,
// with piece 1 as the IP, piece 2 as the port and piece 5 as the protocol.
func GetSuperfastipIP() {
	var proxies []string
	squeeze := func(s string) string { return strings.ReplaceAll(s, " ", "") }

	for page := 0; page <= 10; page++ {
		body, status := GetData(fmt.Sprintf("http://www.superfastip.com/welcome/freeip/%d", page))
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find("tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 10 {
				return
			}
			host, port := squeeze(cols[1]), squeeze(cols[2])
			scheme := strings.ToLower(squeeze(cols[5]))
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, host, port))
		})
	}

	fmt.Println(proxies)
}
// GetKuaidailiIP scrapes the free-proxy listing pages of kuaidaili.com
// (https://www.kuaidaili.com/free/inha/<n>/) and prints the collected proxies
// as "<scheme>://<ip>:<port>" strings.
//
// Pages 1 through 999 are requested in order. Per the original author's note,
// this site needs to be requested through a proxy.
//
// The text of every "#list table tbody tr" row is split on newlines; only
// rows that produce exactly 9 pieces are used, with piece 1 as the IP,
// piece 2 as the port and piece 4 as the protocol.
func GetKuaidailiIP() {
	var proxies []string
	noSpaces := func(s string) string { return strings.ReplaceAll(s, " ", "") }

	for page := 1; page <= 999; page++ {
		pageURL := fmt.Sprintf("https://www.kuaidaili.com/free/inha/%d/", page)
		body, status := GetData(pageURL)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find("#list table tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 9 {
				return
			}
			scheme := strings.ToLower(noSpaces(cols[4]))
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, noSpaces(cols[1]), noSpaces(cols[2])))
		})
	}

	fmt.Println(proxies)
}
// w66ipCellRe captures the inner content of attribute-less <td> cells.
// It is compiled once instead of once per table row.
var w66ipCellRe = regexp.MustCompile(`<td>(.*?)</td>`)

// GetW66ipIP scrapes the listing pages of 66ip.cn
// (http://www.66ip.cn/<n>.html) and prints the collected proxies.
//
// Pages 0 through 1233 are requested. NOTE(review): the original note on this
// function gives the page range as 1~1234; the loop bounds were left as they
// were — confirm the intended range.
//
// For every "#main table tr" row the plain <td> cells are extracted from the
// row HTML: cell 0 is the IP and cell 1 the port. The header row (IP cell
// equal to "ip") is skipped, and rows with fewer than two such cells are
// skipped rather than indexed blindly, which previously caused an
// index-out-of-range panic. Each remaining row yields both an
// "http://<ip>:<port>" and an "https://<ip>:<port>" entry.
func GetW66ipIP() {
	var proxies []string

	for page := 0; page < 1234; page++ {
		pageURL := fmt.Sprintf("http://www.66ip.cn/%d.html", page)
		body, status := GetData(pageURL)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
			continue // no document to query for this page
		}

		doc.Find("#main table tr").Each(func(_ int, row *goquery.Selection) {
			rowHTML, err := row.Html()
			if err != nil {
				return
			}

			cells := w66ipCellRe.FindAllStringSubmatch(rowHTML, -1)
			if len(cells) < 2 {
				return // not a data row (or the markup changed)
			}

			ip, port := cells[0][1], cells[1][1]
			if ip == "ip" {
				return // table header, not a proxy
			}
			proxies = append(proxies,
				fmt.Sprintf("http://%s:%s", ip, port),
				fmt.Sprintf("https://%s:%s", ip, port),
			)
		})
	}

	fmt.Println(proxies)
}
// GetW89ipIP scrapes the listing pages of 89ip.cn
// (http://www.89ip.cn/index_<n>.html) and prints the collected proxies.
//
// Pages 0 through 6 are requested in order. The text of every
// ".layui-table tbody tr" row is split on newlines; only rows that produce
// exactly 12 pieces are used, with piece 2 as the IP and piece 4 as the port
// (newlines, tabs and spaces removed). Each row yields both an
// "http://<ip>:<port>" and an "https://<ip>:<port>" entry.
func GetW89ipIP() {
	var proxies []string
	// Removes the same characters the cells are cleaned of: "\n", "\t", " ".
	blanks := strings.NewReplacer("\n", "", "\t", "", " ", "")

	for page := 0; page <= 6; page++ {
		body, status := GetData(fmt.Sprintf("http://www.89ip.cn/index_%d.html", page))
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find(".layui-table tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 12 {
				return
			}
			host := blanks.Replace(cols[2])
			port := blanks.Replace(cols[4])
			proxies = append(proxies,
				fmt.Sprintf("http://%s:%s", host, port),
				fmt.Sprintf("https://%s:%s", host, port),
			)
		})
	}

	fmt.Println(proxies)
}
// GetJson89ipIP fetches the bulk proxy export page of 89ip.cn
// (http://www.89ip.cn/tqdl.html?num=2000), prints the collected proxies and
// hands every "https://<address>" entry to redis.ListRpush under
// redis.GlobalKey.
//
// Per the original author's note, requests to this page need a wait time
// between them; fetching it too frequently is pointless.
//
// The inner HTML of every ".fly-panel div" element has newlines, tabs and
// spaces removed and is then split on "<br/>". The last piece is always
// dropped; every other piece is treated as an address and yields an
// "http://<address>" and an "https://<address>" entry.
func GetJson89ipIP(redis *redis.RedisGo) {
	var proxies []string

	body, status := GetData("http://www.89ip.cn/tqdl.html?num=2000")
	fmt.Println(status)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
	}

	blanks := strings.NewReplacer("\n", "", "\t", "", " ", "")
	doc.Find(".fly-panel div").Each(func(_ int, panel *goquery.Selection) {
		inner, _ := panel.Html()
		entries := strings.Split(blanks.Replace(inner), "<br/>")
		if len(entries) <= 1 {
			return
		}
		// The final piece is whatever follows the last <br/>, not an address.
		for _, addr := range entries[:len(entries)-1] {
			proxies = append(proxies, fmt.Sprintf("http://%s", addr))
			proxies = append(proxies, fmt.Sprintf("https://%s", addr))
			redis.ListRpush(redis.GlobalKey, fmt.Sprintf("https://%s", addr))
		}
	})

	fmt.Println(proxies)
}
// GetQydailiIP scrapes the free-proxy listing pages of qydaili.com
// (http://www.qydaili.com/free/?action=china&page=<n>) and prints the
// collected proxies as "<scheme>://<ip>:<port>" strings.
//
// Pages 0 through 9 are requested in order. The text of every
// ".container table tbody tr" row is split on newlines; only rows that
// produce exactly 9 pieces are used, with piece 1 as the IP, piece 2 as the
// port and piece 4 as the protocol.
func GetQydailiIP() {
	var proxies []string
	trim := func(s string) string { return strings.ReplaceAll(s, " ", "") }

	for page := 0; page <= 9; page++ {
		target := fmt.Sprintf("http://www.qydaili.com/free/?action=china&page=%d", page)
		body, status := GetData(target)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find(".container table tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 9 {
				return
			}
			host := trim(cols[1])
			port := trim(cols[2])
			scheme := strings.ToLower(trim(cols[4]))
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, host, port))
		})
	}

	fmt.Println(proxies)
}
// GetIp3366IP scrapes the free-proxy listing pages of ip3366.net
// (http://www.ip3366.net/free/?stype=1&page=<n>) and prints the collected
// proxies as "<scheme>://<ip>:<port>" strings.
//
// Pages 0 through 6 are requested in order. The text of every
// "#list table tbody tr" row is split on newlines; only rows that produce
// exactly 9 pieces are used, with piece 1 as the IP, piece 2 as the port and
// piece 4 as the protocol.
func GetIp3366IP() {
	var proxies []string

	for page := 0; page <= 6; page++ {
		body, status := GetData(fmt.Sprintf("http://www.ip3366.net/free/?stype=1&page=%d", page))
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find("#list table tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 9 {
				return
			}
			// Strip spaces from the three cells of interest in one go.
			cleaned := make([]string, 0, 3)
			for _, idx := range []int{1, 2, 4} {
				cleaned = append(cleaned, strings.ReplaceAll(cols[idx], " ", ""))
			}
			host, port, scheme := cleaned[0], cleaned[1], strings.ToLower(cleaned[2])
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, host, port))
		})
	}

	fmt.Println(proxies)
}
// GetXicidailiIP scrapes the listing pages of xicidaili.com
// (https://www.xicidaili.com/nn/<n>) and prints the collected proxies as
// "<scheme>://<ip>:<port>" strings.
//
// Pages 0 through 2344 are requested in order. The text of every
// "#body table tbody tr" row is split on newlines; only rows that produce
// exactly 27 pieces are used, with piece 2 as the IP, piece 3 as the port and
// piece 8 as the protocol.
func GetXicidailiIP() {
	var proxies []string
	dropSpaces := func(s string) string { return strings.ReplaceAll(s, " ", "") }

	for page := 0; page <= 2344; page++ {
		pageURL := fmt.Sprintf("https://www.xicidaili.com/nn/%d", page)
		body, status := GetData(pageURL)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find("#body table tbody tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 27 {
				return
			}
			host := dropSpaces(cols[2])
			port := dropSpaces(cols[3])
			scheme := strings.ToLower(dropSpaces(cols[8]))
			proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, host, port))
		})
	}

	fmt.Println(proxies)
}
// GetIphaiIP scrapes the two free-proxy listings of iphai.com
// (http://www.iphai.com/free/ng and http://www.iphai.com/free/wg) and prints
// the collected proxies as "<scheme>://<ip>:<port>" strings.
//
// The text of every "table tr" row is split on newlines; only rows that
// produce exactly 16 pieces are used, with piece 2 as the IP, piece 4 as the
// port and piece 8 as the protocol. When the protocol cell is empty the proxy
// is recorded as "http://<ip>:<port>" and that string is also passed to
// redis.ListRpush under redis.GlobalKey.
//
// NOTE(review): no identifier named redis is declared by this function or by
// the imports visible in this file — confirm where it is meant to come from.
func GetIphaiIP() {
	var proxies []string
	strip := func(s string) string { return strings.ReplaceAll(s, " ", "") }

	for _, pageURL := range []string{
		"http://www.iphai.com/free/ng",
		"http://www.iphai.com/free/wg",
	} {
		body, status := GetData(pageURL)
		fmt.Println(status)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find("table tr").Each(func(_ int, row *goquery.Selection) {
			cols := strings.Split(row.Text(), "\n")
			if len(cols) != 16 {
				return
			}
			host := strip(cols[2])
			port := strip(cols[4])
			scheme := strings.ToLower(strip(cols[8]))

			if scheme != "" {
				proxies = append(proxies, fmt.Sprintf("%s://%s:%s", scheme, host, port))
				return
			}
			// No protocol listed: fall back to plain http.
			proxies = append(proxies, fmt.Sprintf("http://%s:%s", host, port))
			redis.ListRpush(redis.GlobalKey, fmt.Sprintf("http://%s:%s", host, port))
		})
	}

	fmt.Println(proxies)
}
// GetXsdailiIP scrapes the daily proxy lists published on
// http://www.xsdaili.com/ and prints the collected proxies as
// "<scheme>://<address>" strings.
//
// The front page is searched for links of the form
// /dayProxy/ip/<id>.html; each linked page is fetched and the inner HTML of
// its ".cont" elements is split on "<br/>". Every piece is printed as-is.
// Pieces containing "@" are parsed as "<address>@<scheme>#...": the part
// before "@" is the address and the part between "@" and the first "#" is the
// protocol. Both are printed raw, then stripped of spaces and newlines (the
// protocol is also lower-cased) before being recorded.
func GetXsdailiIP() {
	var proxies []string

	indexHTML, status := GetData("http://www.xsdaili.com/")
	fmt.Println(status)

	linkRe := regexp.MustCompile(`<a href="/dayProxy/ip/(.*?).html">`)
	tidy := strings.NewReplacer(" ", "", "\n", "")

	for _, link := range linkRe.FindAllStringSubmatch(indexHTML, -1) {
		detailURL := fmt.Sprintf("http://www.xsdaili.com/dayProxy/ip/%s.html", link[1])
		detailHTML, detailStatus := GetData(detailURL)
		fmt.Println(detailStatus)

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(detailHTML))
		if err != nil {
			fmt.Println(err)
		}

		doc.Find(".cont").Each(func(_ int, cont *goquery.Selection) {
			inner, _ := cont.Html()
			for _, line := range strings.Split(inner, "<br/>") {
				fmt.Println(line)

				parts := strings.Split(line, "@")
				if len(parts) <= 1 {
					continue
				}
				addr := parts[0]
				scheme := strings.Split(parts[1], "#")[0]
				fmt.Println(addr, scheme, "\n\n\n")

				scheme = strings.ToLower(tidy.Replace(scheme))
				addr = tidy.Replace(addr)
				proxies = append(proxies, fmt.Sprintf("%s://%s", scheme, addr))
			}
		})
	}

	fmt.Println(proxies)
}
/*
parserList = [
{
'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
'type': 'xpath',
'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
},
{
'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
'type': 'xpath',
'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
},
{
'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'],
'type': 'xpath',
'pattern': ".//table[@class='sortable']/tbody/tr",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
},
{
'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)],
'type': 'xpath',
'pattern': ".//table[@class='list']/tr",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
},
{
'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)],
'type': 'module',
'moduleName': 'proxy_listPraser',
'pattern': 'Proxy\(.+\)',
'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2}
},
{
'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in
([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])],
'type': 'xpath',
'pattern': ".//table[@class='proxy__t']/tbody/tr",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
},
{
'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)],
'type': 'xpath',
'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
},
{
'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in
range(1, 11)],
'type': 'xpath',
'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
},
{
'urls': ['http://www.cz88.net/proxy/%s' % m for m in
['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]],
'type': 'xpath',
'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]",
'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''}
},
{
'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)],
'type': 'xpath',
'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
},
{
'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)],
'type': 'xpath',
'pattern': ".//*[@id='ip_list']/tr[position()>1]",
'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'}
},
{
'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
'type': 'module',
'moduleName': 'CnproxyPraser',
'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(\"\:\"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
}
]
'''
*/
喜欢编程的朋友可以关注一波ManGe,随时分享我的更多实例。
Recommend
-
99
:beetle: kafka_cluster_example 项目获取: $ git clone --depth=1 https://github.com/ErikJiang/kafka...
-
28
最近用golang采集网页中遇到了各种不能识别的的乱码字符串,他们大多编码是gbk、gb2312、big5、windows-1252 等编码。有时候,网页上并没有声明编码,却使用上面这种编码的网页也有,也有网页声明的编码和实际使用的编码不同的网页,导致...
-
19
点击上方“3D视觉工坊”,选择“星标”干货第一时间送达机器人作为...
-
15
前言近期读取了一些最新基于RGB图像下的机器人抓取论文,在这里分享下思路。1、Optimizing Correlated Graspability Score and Grasp Regression for Better Grasp Prediction本文提出...
-
19
基于RGB-D相机的机械臂无序抓取系统 ...
-
3
基于gathertool高并发抓取阳光高考招生简章 mange · 1天之前 · 116 次点击 · 预计阅读时间 6 分钟 ·...
-
11
使用Chrome快速实现数据的抓取(一)—— 概述 对于一些简单的网页,我们可以非常容易的通过Develop Tool来获取其请求报文...
-
6
基于Golang和WebSocket打造自已的反向代理 onyas · 5天之前 · 696 次点击 ·...
-
8
HttpClient+Jsoup实现网络爬虫抓取京东商品数据信息 <!--SpringMVC--> <dependency> <groupId>org.springframework.boot</groupId> <...
-
10
为什么你在进行网页抓取时需要代理? 到底什么是代理? 在你建立你自己的代理网络之前,你需要了解代理在网页抓取这个术语中的真正含义。只要你知道了代理是什么,你就会理解它怎么帮你绕过网站的拦截。 IP地址...
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK