4

[Golang] Web Scrape Blogger Post via goquery

 2 years ago
source link: http://siongui.github.io/2018/01/20/go-parse-blogger-post-via-goquery/
Go to the source link to read the article, where you can see the images, any updated content, and better typesetting. If the link is broken, please click the button below to view a snapshot taken at that time.

[Golang] Web Scrape Blogger Post via goquery

Updated: January 29, 2018

Fetch a public post on Blogger and extract data via goquery.

We will extract the following data from HTML:

  • PostUrl
  • Title
  • TimeStamp
  • Author
  • Summary
  • Content
  • Tags

The following is the complete source code:

parse.go | repository | view raw

package main

import (
	"errors"
	"github.com/PuerkitoBio/goquery"
	"strings"
)

// PostData collects the pieces of a Blogger post that ParseBlogspotPost
// extracts from a page. Every field is stored as a plain string exactly as
// the corresponding GetBlogspot* helper returned it.
type PostData struct {
	// PostUrl comes from the og:url meta tag; Title is the trimmed text of
	// the h3.post-title element.
	PostUrl, Title string
	// TimeStamp is the title attribute of the timestamp link's abbr element;
	// Author is the text of the span.fn inside span.post-author.
	TimeStamp, Author string
	// Summary comes from the og:description meta tag; Content is the inner
	// HTML of div.post-body.
	Summary, Content string
	// Tags is the text of every label link joined with ", ".
	Tags string
}

// GetBlogspotTimeStamp returns the title attribute of the first abbr element
// that is a direct child of an a.timestamp-link element in doc.
//
// It returns the error "cannot find timestamp" when that attribute is not
// present.
func GetBlogspotTimeStamp(doc *goquery.Document) (string, error) {
	stamp, found := doc.Find("a.timestamp-link > abbr").First().Attr("title")
	if !found {
		return "", errors.New("cannot find timestamp")
	}
	return stamp, nil
}

// GetBlogspotTitle returns the text of the first h3.post-title element in
// doc with surrounding whitespace trimmed. The error result is always nil.
func GetBlogspotTitle(doc *goquery.Document) (string, error) {
	heading := doc.Find("h3.post-title").First().Text()
	return strings.TrimSpace(heading), nil
}

// GetBlogspotContent returns the HTML of the first div.post-body element in
// doc, as produced by the selection's Html method. Any error reported by
// Html is passed back unchanged.
func GetBlogspotContent(doc *goquery.Document) (string, error) {
	body, err := doc.Find("div.post-body").First().Html()
	return body, err
}

// GetBlogspotUrl returns the content attribute of the first meta element
// whose property is og:url.
//
// It returns the error "cannot find url" when that attribute is not present.
func GetBlogspotUrl(doc *goquery.Document) (string, error) {
	link, found := doc.Find("meta[property='og:url']").First().Attr("content")
	if !found {
		return "", errors.New("cannot find url")
	}
	return link, nil
}

// GetBlogspotSummary returns the content attribute of the first meta element
// whose property is og:description.
//
// It returns the error "cannot find summary" when that attribute is not
// present.
func GetBlogspotSummary(doc *goquery.Document) (string, error) {
	desc, found := doc.Find("meta[property='og:description']").First().Attr("content")
	if !found {
		return "", errors.New("cannot find summary")
	}
	return desc, nil
}

// GetBlogspotAuthor returns the untrimmed text of the first span.fn element
// that is a direct child of span.post-author. The error result is always nil.
func GetBlogspotAuthor(doc *goquery.Document) (string, error) {
	name := doc.Find("span.post-author > span.fn").First().Text()
	return name, nil
}

// GetBlogspotTags returns the text of every a element that is a direct child
// of span.post-labels, joined in document order with ", ". The error result
// is always nil.
//
// The separator is written only once the accumulated string is non-empty, so
// label texts that are empty at the start of the list add no leading ", ".
func GetBlogspotTags(doc *goquery.Document) (string, error) {
	const sep = ", "

	anchors := doc.Find("span.post-labels > a")
	joined := ""
	for i, total := 0, anchors.Length(); i < total; i++ {
		name := anchors.Eq(i).Text()
		if joined == "" {
			// Nothing accumulated yet: start the list without a separator.
			joined = name
			continue
		}
		joined += sep + name
	}
	return joined, nil
}

// ParseBlogspotPost runs each GetBlogspot* extractor against doc and stores
// the results in a new PostData.
//
// The extractors run in a fixed order: timestamp, title, content, URL,
// summary, author, tags. Processing stops at the first extractor that
// reports an error; that error is returned together with the PostData filled
// in so far. On success the error is nil.
func ParseBlogspotPost(doc *goquery.Document) (*PostData, error) {
	post := &PostData{}

	// Each step pairs a destination field with the extractor that fills it.
	steps := []struct {
		dst   *string
		fetch func(*goquery.Document) (string, error)
	}{
		{&post.TimeStamp, GetBlogspotTimeStamp},
		{&post.Title, GetBlogspotTitle},
		{&post.Content, GetBlogspotContent},
		{&post.PostUrl, GetBlogspotUrl},
		{&post.Summary, GetBlogspotSummary},
		{&post.Author, GetBlogspotAuthor},
		{&post.Tags, GetBlogspotTags},
	}

	for _, step := range steps {
		value, err := step.fetch(doc)
		// Store the value before checking err so the field holds whatever
		// the extractor returned even on failure.
		*step.dst = value
		if err != nil {
			return post, err
		}
	}

	return post, nil
}

// main downloads one sample Blogger post, parses it with ParseBlogspotPost
// and prints each extracted field on its own line using the builtin println.
// It panics if either the download or the parsing fails.
func main() {
	// Alternative sample post:
	// "https://oathbystyx.blogspot.tw/2018/01/descartes-rules-of-signs.html"
	const postURL = "https://timrau.blogspot.com/2017/11/avoid-vim-overwriting-indention-settings.html"

	// NOTE(review): newer goquery releases appear to mark NewDocument as
	// deprecated — confirm against the goquery version in use.
	doc, err := goquery.NewDocument(postURL)
	if err != nil {
		panic(err)
	}

	post, err := ParseBlogspotPost(doc)
	if err != nil {
		panic(err)
	}

	fields := []string{
		post.TimeStamp,
		post.Title,
		post.Content,
		post.PostUrl,
		post.Summary,
		post.Author,
		post.Tags,
	}
	for _, field := range fields {
		println(field)
	}
}

Tested on: Ubuntu Linux 17.10, Go 1.9.2.


References:

[1][Golang] Web Scrape Facebook Post via goquery

[3]Tips and tricks · PuerkitoBio/goquery Wiki · GitHub


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK