123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300 |
- package html2text
- import (
- "bytes"
- "io"
- "regexp"
- "strings"
- "unicode"
- "golang.org/x/net/html"
- "golang.org/x/net/html/atom"
- )
// spacingRe matches a run of one or more spaces, carriage returns, newlines
// or tabs.
var spacingRe = regexp.MustCompile(`[ \r\n\t]+`)

// newlineRe matches a run of two or more consecutive newlines.
var newlineRe = regexp.MustCompile(`\n\n+`)
// textifyTraverseCtx holds the output buffer and the formatting state that is
// threaded through a traversal of a parsed HTML tree.
type textifyTraverseCtx struct {
	// Buf accumulates the generated plain text.
	Buf bytes.Buffer

	// prefix is written to Buf immediately after every newline that is
	// emitted; it is non-empty while inside a blockquote.
	prefix string
	// blockquoteLevel is the current blockquote nesting depth (0 = none).
	blockquoteLevel int
	// lineLength counts the characters emitted since the last newline; the
	// prefix written after a newline is not included in the count.
	lineLength int

	// endsWithSpace records whether the last emitted chunk ended in
	// whitespace, so the next chunk knows whether it needs a separating space.
	endsWithSpace bool
	// endsWithNewline is not read or written by the code in view.
	endsWithNewline bool
	// justClosedDiv is set once a <div> has been rendered and cleared when
	// any other element is entered; it suppresses a duplicate newline when
	// nested divs close back to back.
	justClosedDiv bool
}
- func (ctx *textifyTraverseCtx) traverse(node *html.Node) error {
- switch node.Type {
- default:
- return ctx.traverseChildren(node)
- case html.TextNode:
- data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
- return ctx.emit(data)
- case html.ElementNode:
- ctx.justClosedDiv = false
- switch node.DataAtom {
- case atom.Br:
- return ctx.emit("\n")
- case atom.H1, atom.H2, atom.H3:
- subCtx := textifyTraverseCtx{}
- if err := subCtx.traverseChildren(node); err != nil {
- return err
- }
- str := subCtx.Buf.String()
- dividerLen := 0
- for _, line := range strings.Split(str, "\n") {
- if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
- dividerLen = lineLen - 1
- }
- }
- divider := ""
- if node.DataAtom == atom.H1 {
- divider = strings.Repeat("*", dividerLen)
- } else {
- divider = strings.Repeat("-", dividerLen)
- }
- if node.DataAtom == atom.H3 {
- return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
- }
- return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
- case atom.Blockquote:
- ctx.blockquoteLevel++
- ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- if ctx.blockquoteLevel == 1 {
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- ctx.blockquoteLevel--
- ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
- if ctx.blockquoteLevel > 0 {
- ctx.prefix += " "
- }
- return ctx.emit("\n\n")
- case atom.Div:
- if ctx.lineLength > 0 {
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- var err error
- if ctx.justClosedDiv == false {
- err = ctx.emit("\n")
- }
- ctx.justClosedDiv = true
- return err
- case atom.Li:
- if err := ctx.emit("* "); err != nil {
- return err
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- return ctx.emit("\n")
- case atom.B, atom.Strong:
- subCtx := textifyTraverseCtx{}
- subCtx.endsWithSpace = true
- if err := subCtx.traverseChildren(node); err != nil {
- return err
- }
- str := subCtx.Buf.String()
- return ctx.emit("*" + str + "*")
- case atom.A:
- // If image is the only child, take its alt text as the link text
- if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
- if altText := getAttrVal(img, "alt"); altText != "" {
- ctx.emit(altText)
- }
- } else if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- hrefLink := ""
- if attrVal := getAttrVal(node, "href"); attrVal != "" {
- attrVal = ctx.normalizeHrefLink(attrVal)
- if attrVal != "" {
- hrefLink = "( " + attrVal + " )"
- }
- }
- return ctx.emit(hrefLink)
- case atom.P, atom.Ul, atom.Table:
- if err := ctx.emit("\n\n"); err != nil {
- return err
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- return ctx.emit("\n\n")
- case atom.Tr:
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- return ctx.emit("\n")
- case atom.Style, atom.Script, atom.Head:
- // Ignore the subtree
- return nil
- default:
- return ctx.traverseChildren(node)
- }
- }
- }
- func (ctx *textifyTraverseCtx) traverseChildren(node *html.Node) error {
- for c := node.FirstChild; c != nil; c = c.NextSibling {
- if err := ctx.traverse(c); err != nil {
- return err
- }
- }
- return nil
- }
- func (ctx *textifyTraverseCtx) emit(data string) error {
- if len(data) == 0 {
- return nil
- }
- lines := ctx.breakLongLines(data)
- var err error
- for _, line := range lines {
- runes := []rune(line)
- startsWithSpace := unicode.IsSpace(runes[0])
- if !startsWithSpace && !ctx.endsWithSpace {
- ctx.Buf.WriteByte(' ')
- ctx.lineLength++
- }
- ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
- for _, c := range line {
- _, err = ctx.Buf.WriteString(string(c))
- if err != nil {
- return err
- }
- ctx.lineLength++
- if c == '\n' {
- ctx.lineLength = 0
- if ctx.prefix != "" {
- _, err = ctx.Buf.WriteString(ctx.prefix)
- if err != nil {
- return err
- }
- }
- }
- }
- }
- return nil
- }
- func (ctx *textifyTraverseCtx) breakLongLines(data string) []string {
- // only break lines when we are in blockquotes
- if ctx.blockquoteLevel == 0 {
- return []string{data}
- }
- var ret []string
- runes := []rune(data)
- l := len(runes)
- existing := ctx.lineLength
- if existing >= 74 {
- ret = append(ret, "\n")
- existing = 0
- }
- for l+existing > 74 {
- i := 74 - existing
- for i >= 0 && !unicode.IsSpace(runes[i]) {
- i--
- }
- if i == -1 {
- // no spaces, so go the other way
- i = 74 - existing
- for i < l && !unicode.IsSpace(runes[i]) {
- i++
- }
- }
- ret = append(ret, string(runes[:i])+"\n")
- for i < l && unicode.IsSpace(runes[i]) {
- i++
- }
- runes = runes[i:]
- l = len(runes)
- existing = 0
- }
- if len(runes) > 0 {
- ret = append(ret, string(runes))
- }
- return ret
- }
- func (ctx *textifyTraverseCtx) normalizeHrefLink(link string) string {
- link = strings.TrimSpace(link)
- link = strings.TrimPrefix(link, "mailto:")
- return link
- }
- func getAttrVal(node *html.Node, attrName string) string {
- for _, attr := range node.Attr {
- if attr.Key == attrName {
- return attr.Val
- }
- }
- return ""
- }
- func FromReader(reader io.Reader) (string, error) {
- doc, err := html.Parse(reader)
- if err != nil {
- return "", err
- }
- ctx := textifyTraverseCtx{
- Buf: bytes.Buffer{},
- }
- if err = ctx.traverse(doc); err != nil {
- return "", err
- }
- text := strings.TrimSpace(newlineRe.ReplaceAllString(
- strings.Replace(ctx.Buf.String(), "\n ", "\n", -1), "\n\n"))
- return text, nil
- }
- func FromString(input string) (string, error) {
- text, err := FromReader(strings.NewReader(input))
- if err != nil {
- return "", err
- }
- return text, nil
- }
|