48

网站邮箱email地址定向采集核心代码分享

 4 years ago
source link: https://www.tuicool.com/articles/nmQfM3i
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

邮箱采集demo: http://www.jsanai.com/emailco...

原理:

1、根据要采集的url地址,获取页面html内容,然后采用正则匹配出页面的url列表、邮箱地址列表。

2、获取到url列表及邮箱后分两个异步线程:

①保存邮箱地址;

②分析采集子页面url的邮箱地址;

核心源码(golang):

// CollectEmail is the entry point for scraping a single page.
//
// It downloads hosturl via HttpGetV2, converts the body to UTF-8 when the page
// declares a GB-family charset, and then extracts:
//   - e-mail addresses (matchEmail) and phone numbers (matchPhone) into the
//     returned EmailObj, whose Surl is set to hosturl;
//   - the links on the page (matchUrls) that resolve to an http(s) URL on the
//     same host as hosturl, returned as absolute URLs for further crawling.
//
// Both the e-mail list and the link list are passed through
// RemoveRepeatedElement before being returned.
//
// An error is returned when hosturl cannot be parsed, the page cannot be
// fetched, or the GB18030 decoding fails. The EmailObj returned alongside an
// error carries at most Surl.
func CollectEmail(hosturl string) (EmailObj, []string, error) {
	emailObj := new(EmailObj)
	var inhost []string

	// The parsed entry URL is the base against which page links are resolved.
	uparse, err := url.Parse(hosturl)
	if err != nil {
		return *emailObj, inhost, err
	}
	emailObj.Surl = hosturl

	bodystr, err := HttpGetV2(hosturl)
	if err != nil {
		// Keep the original message prefix but do not lose the cause.
		return *emailObj, inhost, errors.New("get request error: " + err.Error())
	}

	// Detect GB-encoded pages (gbk / gb2312 / gb18030). The probe is
	// case-insensitive because charset labels are; only the lowered copy is
	// used for detection, the body itself is left untouched.
	lowered := strings.ToLower(bodystr)
	if strings.Contains(lowered, "charset=gb") || strings.Contains(lowered, "gb2312") {
		decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes([]byte(bodystr))
		if err != nil {
			return *emailObj, inhost, errors.New("simplifiedchinese coding change error: " + err.Error())
		}
		bodystr = string(decodeBytes)
	}

	// E-mail addresses found on the page.
	emailObj.Emails = append(emailObj.Emails, matchEmail(bodystr)...)
	// Contact phone numbers found on the page.
	emailObj.Phones = append(emailObj.Phones, matchPhone(bodystr)...)

	// Collect same-host links for crawling the inner pages.
	links := matchUrls(bodystr)
	for _, item := range links {
		ref, err := url.Parse(item)
		if err != nil {
			continue
		}
		// Skip script and stylesheet assets. Compare the suffix rather than a
		// substring so that pages such as "list.jsp" are not discarded.
		lowPath := strings.ToLower(ref.Path)
		if strings.HasSuffix(lowPath, ".js") || strings.HasSuffix(lowPath, ".css") {
			continue
		}
		// Resolve relative, root-relative and protocol-relative references
		// against the page URL; this also preserves the query string.
		abs := uparse.ResolveReference(ref)
		// mailto:, javascript:, tel: and similar are not crawlable pages.
		if abs.Scheme != "http" && abs.Scheme != "https" {
			continue
		}
		if abs.Host != uparse.Host {
			continue
		}
		// Fragments never reach the server; dropping them lets "page#a" and
		// "page#b" collapse into one crawl target.
		abs.Fragment = ""
		inhost = append(inhost, abs.String())
	}

	inhost = RemoveRepeatedElement(inhost)
	emailObj.Emails = RemoveRepeatedElement(emailObj.Emails)
	return *emailObj, inhost, nil
}
// Patterns used by matchEmail, compiled once instead of on every call.
var (
	// <style>...</style> and <script>...</script> blocks, contents included.
	styleBlockRe  = regexp.MustCompile(`(?i)<style[\S\s]+?</style>`)
	scriptBlockRe = regexp.MustCompile(`(?i)<script[\S\s]+?</script>`)
	// Any remaining HTML tag.
	htmlTagRe = regexp.MustCompile(`<[^>]*?>`)
	// An address whose separator is "@" or the common "#" obfuscation and
	// whose domain ends in com, cn, org, net, com.cn or org.cn. The two-part
	// suffixes are listed first because alternation is leftmost-first; the
	// sub-domain repetition is lazy so the shortest valid domain wins when
	// unrelated text directly follows the address.
	emailRe = regexp.MustCompile(`(?i)\w[\w.+-]*[@#][\w-]+(?:\.[\w-]+)*?\.(?:com\.cn|org\.cn|com|cn|org|net)`)
)

// matchEmail returns the e-mail addresses found in the HTML text str.
//
// Style blocks, script blocks and all tags are removed first (replaced by the
// empty string), then every match of emailRe is lower-cased and any "#"
// separator is rewritten to "@". Duplicates are not removed and the result is
// nil when nothing matches.
func matchEmail(str string) (email []string) {
	str = styleBlockRe.ReplaceAllString(str, "")
	str = scriptBlockRe.ReplaceAllString(str, "")
	str = htmlTagRe.ReplaceAllString(str, "")

	var emailList []string
	for _, addr := range emailRe.FindAllString(str, -1) {
		emailList = append(emailList, strings.Replace(strings.ToLower(addr), "#", "@", -1))
	}
	return emailList
}
// anchorHrefRe matches the opening tag of an <a> element and captures the
// quoted value of its href attribute. Whitespace is required after "<a" and
// before "href", so tags such as <abbr> and attributes such as data-href do
// not match. Only the opening tag is inspected, which means anchors whose
// content contains nested markup are found as well.
var anchorHrefRe = regexp.MustCompile(`(?i)<a\s+(?:[^>]*?\s)?href\s*=\s*["']([^"']*)["']`)

// matchUrls returns the href values of all <a> elements in the HTML text str,
// in document order, with surrounding whitespace trimmed. Values are returned
// as written in the page (they may be relative) and are not de-duplicated.
// The result is nil when no anchor is found.
func matchUrls(str string) (urls []string) {
	var urlList []string
	for _, m := range anchorHrefRe.FindAllStringSubmatch(str, -1) {
		urlList = append(urlList, strings.TrimSpace(m[1]))
	}
	return urlList
}

核心代码使用golang实现,已经过近6个月的实际使用及改进,请放心使用。

其中涉及到爬虫相关内容,由于当前大部分网站都有反爬虫机制及相关协议,请大家在使用的时候多加注意。


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK