7

Online Taobao Item to reStructuredText Image on Google App Engine Go

 2 years ago
source link: http://siongui.github.io/2016/05/14/gae-go-online-taobao-item-to-rst/
Visit the source link to read the original article, which includes the images, any later updates, and better formatting. If the link is broken, click the button below to view the snapshot taken at the time.

Online Taobao Item to reStructuredText Image on Google App Engine Go

May 14, 2016

An online service running on Google App Engine Go that extracts the title and image URL from a Taobao item webpage and outputs them in reStructuredText format.

Online Taobao Item to reStructuredText

Source code:

Makefile | repository | view raw

# Make the App Engine Go SDK tools (goapp, appcfg.py) reachable by appending
# the SDK directory, located four levels above this one, to PATH.
export PATH := $(PATH):$(realpath ../../../../go_appengine/)
# Directory containing this Makefile; passed to the SDK tools as the app root.
PROJECT_DIR=$(CURDIR)
# App Engine application ID and version name used for deployment.
PROJECT_ID=golden-operator-130720
PROJECT_VERSION=taobao-item2rst

# Default target: start the local development server for this app.
default:
	@echo "\033[92mRun development web server ...\033[0m"
	@cd ../; goapp serve ${PROJECT_DIR}

# Format all Go source files in this directory.
fmt:
	@echo "\033[92mGo fmt source code ...\033[0m"
	@goapp fmt *.go

# Upload the app as version PROJECT_VERSION of PROJECT_ID, then print its URL.
deploy:
	cd ../; appcfg.py -A ${PROJECT_ID} -V ${PROJECT_VERSION} update ${PROJECT_DIR}
	@echo "\033[92mDeployed URL: http://${PROJECT_VERSION}.${PROJECT_ID}.appspot.com/\033[0m"

# Fetch (or update) the two third-party packages the Go sources import.
install:
	@echo "\033[92mInstall golang.org/x/net/html ...\033[0m"
	@goapp get -u golang.org/x/net/html
	@echo "\033[92mInstall google.golang.org/appengine ...\033[0m"
	@goapp get -u google.golang.org/appengine

app.yaml | repository | view raw

# App Engine configuration: Go runtime, API version go1.
runtime: go
api_version: go1

# Every request path (/.*) is dispatched to the Go application.
handlers:
- url: /.*
  script: _go_app
taobaoitem2rst.go | repository | view raw
package taobaoitem2rst

import (
	"html/template"
	"net/http"
)

// TemplateValue is the data passed to the index page template.
type TemplateValue struct {
	// Textarea is the text rendered inside the page's <textarea>.
	Textarea string
}

// index is the HTML source of the app's only page. It contains a form that
// POSTs a "url" field to "/", a textarea filled from TemplateValue.Textarea,
// and a button whose script selects the textarea, runs
// document.execCommand('copy'), and then overwrites the textarea with
// "Copy OK" or "Copy Fail" depending on the result.
var index = `<!doctype html>
<html>
<head>
  <title>Taobao Item to Rst</title>
</head>
<body>
  <form action="/" method="post">
    URL: <input name="url" size="80">
    <button>Send</button>
  </form><br>
  <textarea id="ta" rows="5" cols="80">{{.Textarea}}</textarea><br>
  <button type="button" id="copy">Copy textarea to clipboard</button>

  <br><br>
  <a target="_blank" href="http://html2rst.golden-operator-130720.appspot.com/">HTML to reStructuredText</a>
  <br><br>
  <a target="_blank" href="http://v1.golden-operator-130720.appspot.com/">URL to reStructuredText</a>

<script>
  var textareaElm = document.getElementById("ta");
  var copyElm = document.getElementById("copy");
  copyElm.onclick = function(event) {
    textareaElm.select();
    var isSuccessful = document.execCommand('copy');
    if (isSuccessful) {
      textareaElm.value = "Copy OK";
    } else {
      textareaElm.value = "Copy Fail";
    }
  }
</script>

</body>
</html>`

// tmpl is the parsed form of index. template.Must panics during package
// initialisation if index does not parse.
var tmpl = template.Must(template.New("taobaoitem2rst").Parse(index))

// init registers handler on the default serve mux for every request path.
func init() {
	http.Handle("/", http.HandlerFunc(handler))
}

// handler serves the app's single page.
//
// For a POST request it reads the "url" form field, passes it to
// getTaobaoItemImgRst, and renders the page with the result in the textarea.
// For any other method it renders the page with an empty textarea.
//
// If rendering fails, the client receives a 500 response instead of the
// handler panicking.
func handler(w http.ResponseWriter, r *http.Request) {
	var val TemplateValue
	if r.Method == "POST" {
		val.Textarea = getTaobaoItemImgRst(r.PostFormValue("url"), r)
	}

	if err := tmpl.Execute(w, &val); err != nil {
		// Execute writes straight to w, so part of the page may already have
		// been sent; in that case the status line cannot be changed any more
		// and this call only appends the message.
		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
	}
}

fetch.go | repository | view raw

package taobaoitem2rst

import (
	"bytes"
	"html/template"
	"net/http"

	"google.golang.org/appengine"
	"google.golang.org/appengine/urlfetch"
)

// imgRst is the template source for a reStructuredText image directive. It is
// executed with an ItemInfo, whose ImgURL, Title and URL fields fill the
// image location, the :alt: option and the :target: option respectively.
var imgRst = `.. image:: {{ .ImgURL }}
   :alt: {{ .Title }}
   :target: {{ .URL }}
   :align: center`

func getTaobaoItemImgRst(url string, r *http.Request) string {
	nUrl := NormalizeURL(url)
	ctx := appengine.NewContext(r)
	client := urlfetch.Client(ctx)
	resp, err := client.Get(nUrl)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	ii := getTaobaoItemInfo(resp.Body)
	ii.URL = nUrl

	tmpl := template.Must(template.New("imgRst").Parse(imgRst))
	var rst bytes.Buffer
	err = tmpl.Execute(&rst, &ii)
	if err != nil {
		panic(err)
	}

	return rst.String()
}

urlnormalize.go | repository | view raw

package taobaoitem2rst

import (
	"net/url"
)

// NormalizeURL reduces the query string of an item.taobao.com URL to its
// "id" parameter.
//
// When inputUrl parses successfully and its host is exactly
// "item.taobao.com", every query parameter except "id" is dropped and the
// id value is query-escaped; a missing id yields the query "id=". Any other
// input, including one that cannot be parsed, is returned unchanged.
func NormalizeURL(inputUrl string) string {
	u, err := url.Parse(inputUrl)
	if err != nil {
		// url.Parse returns a nil *URL on failure, so the error has to be
		// checked before any field of u is read. An unparseable input is
		// simply left as it is.
		return inputUrl
	}

	if u.Host != "item.taobao.com" {
		return inputUrl
	}

	// Encode escapes the id so the rebuilt query is always well-formed.
	u.RawQuery = url.Values{"id": {u.Query().Get("id")}}.Encode()
	return u.String()
}

iteminfo.go | repository | view raw

package taobaoitem2rst

import (
	"golang.org/x/net/html"
	"io"
)

// ItemInfo collects the values extracted from an item page and is the data
// passed to the imgRst template.
type ItemInfo struct {
	// Title is the text of the page's <title> element.
	Title string
	// URL is the href of the page's <link rel="canonical"> element.
	URL string
	// ImgURL is the src of the <img id="J_ImgBooth"> element or the content
	// of the <meta property="og:image"> element.
	ImgURL string
}

// GetAttribute looks up the attribute named key on node n. It returns the
// value of the first matching attribute and true, or "" and false when n has
// no such attribute.
func GetAttribute(n *html.Node, key string) (string, bool) {
	for i := range n.Attr {
		if a := n.Attr[i]; a.Key == key {
			return a.Val, true
		}
	}
	return "", false
}

// isTitleElement reports whether n is an element node named "title".
func isTitleElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "title"
}

// isLinkElement reports whether n is an element node named "link".
func isLinkElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "link"
}

// isImgElement reports whether n is an element node named "img".
func isImgElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "img"
}

// isMetaElement reports whether n is an element node named "meta".
func isMetaElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "meta"
}

// traverse walks the tree rooted at n depth-first and records in ii the
// values it finds:
//
//   - Title:  the first child's data of a <title> element,
//   - URL:    the href of a <link rel="canonical"> element,
//   - ImgURL: the src of <img id="J_ImgBooth"> or the content of
//     <meta property="og:image">.
//
// When several nodes match the same field, the one visited last wins.
func traverse(n *html.Node, ii *ItemInfo) {
	// An empty <title></title> has no child node; skip it rather than
	// dereferencing a nil FirstChild.
	if isTitleElement(n) && n.FirstChild != nil {
		ii.Title = n.FirstChild.Data
	}
	if isLinkElement(n) {
		rel, ok := GetAttribute(n, "rel")
		if ok && rel == "canonical" {
			ii.URL, _ = GetAttribute(n, "href")
		}
	}
	if isImgElement(n) {
		// item.taobao.com
		id, ok := GetAttribute(n, "id")
		if ok && id == "J_ImgBooth" {
			ii.ImgURL, _ = GetAttribute(n, "src")
		}
	}
	if isMetaElement(n) {
		// world.taobao.com
		property, ok := GetAttribute(n, "property")
		if ok && property == "og:image" {
			ii.ImgURL, _ = GetAttribute(n, "content")
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		traverse(c, ii)
	}
}

// getTaobaoItemInfo parses the HTML document read from r and returns the
// ItemInfo that traverse collects from it. It panics when the document
// cannot be parsed.
func getTaobaoItemInfo(r io.Reader) ItemInfo {
	root, parseErr := html.Parse(r)
	if parseErr != nil {
		panic("Fail to parse html")
	}

	var info ItemInfo
	traverse(root, &info)
	return info
}

Tested on: Ubuntu Linux 16.04, Google App Engine SDK for Go 1.9.37.


References:

[1] Google App Engine Go - HTML Link to reStructuredText

[2] [Golang] Remove Query String From URL

[3] [Golang] Hacker News Link to reStructuredText

[4] [Golang] getElementById via net/html Package


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK