html2text.go 6.2 KB


  1. package html2text
  2. import (
  3. "bytes"
  4. "io"
  5. "regexp"
  6. "strings"
  7. "unicode"
  8. "golang.org/x/net/html"
  9. "golang.org/x/net/html/atom"
  10. )
  // spacingRe matches any run of spaces, carriage returns, newlines and tabs.
  // traverse uses it to collapse the whitespace inside text nodes.
  var spacingRe = regexp.MustCompile(`[ \r\n\t]+`)

  // newlineRe matches two or more consecutive newlines. FromReader uses it to
  // squeeze runs of blank lines down to a single blank line.
  var newlineRe = regexp.MustCompile(`\n\n+`)
  // textifyTraverseCtx carries the output buffer and the formatting state that
  // is threaded through a traversal of the parsed HTML tree.
  type textifyTraverseCtx struct {
  	// Buf accumulates the generated plain text.
  	Buf bytes.Buffer

  	// prefix is written to Buf right after every newline that emit outputs
  	// (the ">" markers while inside blockquotes).
  	prefix string

  	// blockquoteLevel is the current <blockquote> nesting depth; long lines
  	// are only wrapped while it is greater than zero.
  	// lineLength counts what emit has written since the last newline (the
  	// prefix itself is not counted).
  	blockquoteLevel, lineLength int

  	// endsWithSpace records whether the last emitted chunk ended in
  	// whitespace, so emit knows if a separating space is required.
  	// endsWithNewline is not read or written by the code in this file.
  	// justClosedDiv is set when a <div> has just been closed and cleared when
  	// the next element starts; it suppresses a duplicate newline after nested
  	// divs.
  	endsWithSpace, endsWithNewline, justClosedDiv bool
  }
  24. func (ctx *textifyTraverseCtx) traverse(node *html.Node) error {
  25. switch node.Type {
  26. default:
  27. return ctx.traverseChildren(node)
  28. case html.TextNode:
  29. data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
  30. return ctx.emit(data)
  31. case html.ElementNode:
  32. ctx.justClosedDiv = false
  33. switch node.DataAtom {
  34. case atom.Br:
  35. return ctx.emit("\n")
  36. case atom.H1, atom.H2, atom.H3:
  37. subCtx := textifyTraverseCtx{}
  38. if err := subCtx.traverseChildren(node); err != nil {
  39. return err
  40. }
  41. str := subCtx.Buf.String()
  42. dividerLen := 0
  43. for _, line := range strings.Split(str, "\n") {
  44. if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
  45. dividerLen = lineLen - 1
  46. }
  47. }
  48. divider := ""
  49. if node.DataAtom == atom.H1 {
  50. divider = strings.Repeat("*", dividerLen)
  51. } else {
  52. divider = strings.Repeat("-", dividerLen)
  53. }
  54. if node.DataAtom == atom.H3 {
  55. return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
  56. }
  57. return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
  58. case atom.Blockquote:
  59. ctx.blockquoteLevel++
  60. ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
  61. if err := ctx.emit("\n"); err != nil {
  62. return err
  63. }
  64. if ctx.blockquoteLevel == 1 {
  65. if err := ctx.emit("\n"); err != nil {
  66. return err
  67. }
  68. }
  69. if err := ctx.traverseChildren(node); err != nil {
  70. return err
  71. }
  72. ctx.blockquoteLevel--
  73. ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
  74. if ctx.blockquoteLevel > 0 {
  75. ctx.prefix += " "
  76. }
  77. return ctx.emit("\n\n")
  78. case atom.Div:
  79. if ctx.lineLength > 0 {
  80. if err := ctx.emit("\n"); err != nil {
  81. return err
  82. }
  83. }
  84. if err := ctx.traverseChildren(node); err != nil {
  85. return err
  86. }
  87. var err error
  88. if ctx.justClosedDiv == false {
  89. err = ctx.emit("\n")
  90. }
  91. ctx.justClosedDiv = true
  92. return err
  93. case atom.Li:
  94. if err := ctx.emit("* "); err != nil {
  95. return err
  96. }
  97. if err := ctx.traverseChildren(node); err != nil {
  98. return err
  99. }
  100. return ctx.emit("\n")
  101. case atom.B, atom.Strong:
  102. subCtx := textifyTraverseCtx{}
  103. subCtx.endsWithSpace = true
  104. if err := subCtx.traverseChildren(node); err != nil {
  105. return err
  106. }
  107. str := subCtx.Buf.String()
  108. return ctx.emit("*" + str + "*")
  109. case atom.A:
  110. // If image is the only child, take its alt text as the link text
  111. if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
  112. if altText := getAttrVal(img, "alt"); altText != "" {
  113. ctx.emit(altText)
  114. }
  115. } else if err := ctx.traverseChildren(node); err != nil {
  116. return err
  117. }
  118. hrefLink := ""
  119. if attrVal := getAttrVal(node, "href"); attrVal != "" {
  120. attrVal = ctx.normalizeHrefLink(attrVal)
  121. if attrVal != "" {
  122. hrefLink = "( " + attrVal + " )"
  123. }
  124. }
  125. return ctx.emit(hrefLink)
  126. case atom.P, atom.Ul, atom.Table:
  127. if err := ctx.emit("\n\n"); err != nil {
  128. return err
  129. }
  130. if err := ctx.traverseChildren(node); err != nil {
  131. return err
  132. }
  133. return ctx.emit("\n\n")
  134. case atom.Tr:
  135. if err := ctx.traverseChildren(node); err != nil {
  136. return err
  137. }
  138. return ctx.emit("\n")
  139. case atom.Style, atom.Script, atom.Head:
  140. // Ignore the subtree
  141. return nil
  142. default:
  143. return ctx.traverseChildren(node)
  144. }
  145. }
  146. }
  147. func (ctx *textifyTraverseCtx) traverseChildren(node *html.Node) error {
  148. for c := node.FirstChild; c != nil; c = c.NextSibling {
  149. if err := ctx.traverse(c); err != nil {
  150. return err
  151. }
  152. }
  153. return nil
  154. }
  155. func (ctx *textifyTraverseCtx) emit(data string) error {
  156. if len(data) == 0 {
  157. return nil
  158. }
  159. lines := ctx.breakLongLines(data)
  160. var err error
  161. for _, line := range lines {
  162. runes := []rune(line)
  163. startsWithSpace := unicode.IsSpace(runes[0])
  164. if !startsWithSpace && !ctx.endsWithSpace {
  165. ctx.Buf.WriteByte(' ')
  166. ctx.lineLength++
  167. }
  168. ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
  169. for _, c := range line {
  170. _, err = ctx.Buf.WriteString(string(c))
  171. if err != nil {
  172. return err
  173. }
  174. ctx.lineLength++
  175. if c == '\n' {
  176. ctx.lineLength = 0
  177. if ctx.prefix != "" {
  178. _, err = ctx.Buf.WriteString(ctx.prefix)
  179. if err != nil {
  180. return err
  181. }
  182. }
  183. }
  184. }
  185. }
  186. return nil
  187. }
  188. func (ctx *textifyTraverseCtx) breakLongLines(data string) []string {
  189. // only break lines when we are in blockquotes
  190. if ctx.blockquoteLevel == 0 {
  191. return []string{data}
  192. }
  193. var ret []string
  194. runes := []rune(data)
  195. l := len(runes)
  196. existing := ctx.lineLength
  197. if existing >= 74 {
  198. ret = append(ret, "\n")
  199. existing = 0
  200. }
  201. for l+existing > 74 {
  202. i := 74 - existing
  203. for i >= 0 && !unicode.IsSpace(runes[i]) {
  204. i--
  205. }
  206. if i == -1 {
  207. // no spaces, so go the other way
  208. i = 74 - existing
  209. for i < l && !unicode.IsSpace(runes[i]) {
  210. i++
  211. }
  212. }
  213. ret = append(ret, string(runes[:i])+"\n")
  214. for i < l && unicode.IsSpace(runes[i]) {
  215. i++
  216. }
  217. runes = runes[i:]
  218. l = len(runes)
  219. existing = 0
  220. }
  221. if len(runes) > 0 {
  222. ret = append(ret, string(runes))
  223. }
  224. return ret
  225. }
  226. func (ctx *textifyTraverseCtx) normalizeHrefLink(link string) string {
  227. link = strings.TrimSpace(link)
  228. link = strings.TrimPrefix(link, "mailto:")
  229. return link
  230. }
  231. func getAttrVal(node *html.Node, attrName string) string {
  232. for _, attr := range node.Attr {
  233. if attr.Key == attrName {
  234. return attr.Val
  235. }
  236. }
  237. return ""
  238. }
  239. func FromReader(reader io.Reader) (string, error) {
  240. doc, err := html.Parse(reader)
  241. if err != nil {
  242. return "", err
  243. }
  244. ctx := textifyTraverseCtx{
  245. Buf: bytes.Buffer{},
  246. }
  247. if err = ctx.traverse(doc); err != nil {
  248. return "", err
  249. }
  250. text := strings.TrimSpace(newlineRe.ReplaceAllString(
  251. strings.Replace(ctx.Buf.String(), "\n ", "\n", -1), "\n\n"))
  252. return text, nil
  253. }
  254. func FromString(input string) (string, error) {
  255. text, err := FromReader(strings.NewReader(input))
  256. if err != nil {
  257. return "", err
  258. }
  259. return text, nil
  260. }