htmlparser.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. package format
  2. import (
  3. "fmt"
  4. "math"
  5. "regexp"
  6. "strings"
  7. "golang.org/x/net/html"
  8. "strconv"
  9. )
  10. var MatrixToURL = regexp.MustCompile("^(?:https?://)?(?:www\\.)?matrix\\.to/#/([#@!+].*)(?:/(\\$.+))?")
// TextConverter transforms the already-rendered plain-text contents of an
// HTML element into its final textual representation.
type TextConverter func(string) string
// HTMLParser converts HTML into plain text. Its fields customise how
// individual elements are rendered; every converter is optional.
type HTMLParser struct {
	// PillConverter is called for <a> elements whose href matches
	// MatrixToURL, with the first and second capture groups as mxid and
	// eventID. When it is nil the link text is returned as-is.
	PillConverter func(mxid, eventID string) string
	// TabsToSpaces is the number of spaces each tab in the input is replaced
	// with before parsing. A negative value leaves tabs untouched.
	TabsToSpaces int
	// Newline is the string emitted for <br> elements.
	Newline string

	// BoldConverter renders <b>/<strong>; when nil the text is wrapped in "**".
	BoldConverter TextConverter
	// ItalicConverter renders <i>/<em>; when nil the text is wrapped in "_".
	ItalicConverter TextConverter
	// StrikethroughConverter renders <s>/<del>; when nil the text is wrapped in "~~".
	StrikethroughConverter TextConverter
	// UnderlineConverter renders <u>/<ins>; when nil the text is left unchanged.
	UnderlineConverter TextConverter
	// MonospaceBlockConverter renders the contents of <pre>; when nil the
	// text is left unchanged.
	MonospaceBlockConverter TextConverter
	// MonospaceConverter renders <tt>/<code>; when nil the text is left unchanged.
	MonospaceConverter TextConverter
}
// TaggedString is a rendered piece of text together with a tag describing the
// node it came from: the element name for element nodes, "text" for text
// nodes, "html" for the document node and "unknown" for anything else.
type TaggedString struct {
	// string is the rendered text (embedded, so it is accessed as .string).
	string
	// tag identifies the kind of node the text was produced from.
	tag string
}
  27. func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
  28. for _, attr := range node.Attr {
  29. if attr.Key == attribute {
  30. return attr.Val
  31. }
  32. }
  33. return ""
  34. }
  35. func Digits(num int) int {
  36. return int(math.Floor(math.Log10(float64(num))) + 1)
  37. }
  38. func (parser *HTMLParser) listToString(node *html.Node, stripLinebreak bool) string {
  39. ordered := node.Data == "ol"
  40. taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, stripLinebreak)
  41. counter := 1
  42. indentLength := 0
  43. if ordered {
  44. start := parser.getAttribute(node, "start")
  45. if len(start) > 0 {
  46. counter, _ = strconv.Atoi(start)
  47. }
  48. longestIndex := (counter - 1) + len(taggedChildren)
  49. indentLength = Digits(longestIndex)
  50. }
  51. indent := strings.Repeat(" ", indentLength+2)
  52. var children []string
  53. for _, child := range taggedChildren {
  54. if child.tag != "li" {
  55. continue
  56. }
  57. var prefix string
  58. if ordered {
  59. indexPadding := indentLength - Digits(counter)
  60. prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
  61. } else {
  62. prefix = "● "
  63. }
  64. str := prefix + child.string
  65. counter++
  66. parts := strings.Split(str, "\n")
  67. for i, part := range parts[1:] {
  68. parts[i+1] = indent + part
  69. }
  70. str = strings.Join(parts, "\n")
  71. children = append(children, str)
  72. }
  73. return strings.Join(children, "\n")
  74. }
  75. func (parser *HTMLParser) basicFormatToString(node *html.Node, stripLinebreak bool) string {
  76. str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
  77. switch node.Data {
  78. case "b", "strong":
  79. if parser.BoldConverter != nil {
  80. return parser.BoldConverter(str)
  81. }
  82. return fmt.Sprintf("**%s**", str)
  83. case "i", "em":
  84. if parser.ItalicConverter != nil {
  85. return parser.ItalicConverter(str)
  86. }
  87. return fmt.Sprintf("_%s_", str)
  88. case "s", "del":
  89. if parser.StrikethroughConverter != nil {
  90. return parser.StrikethroughConverter(str)
  91. }
  92. return fmt.Sprintf("~~%s~~", str)
  93. case "u", "ins":
  94. if parser.UnderlineConverter != nil {
  95. return parser.UnderlineConverter(str)
  96. }
  97. case "tt", "code":
  98. if parser.MonospaceConverter != nil {
  99. return parser.MonospaceConverter(str)
  100. }
  101. }
  102. return str
  103. }
  104. func (parser *HTMLParser) headerToString(node *html.Node, stripLinebreak bool) string {
  105. children := parser.nodeToStrings(node.FirstChild, stripLinebreak)
  106. length := int(node.Data[1] - '0')
  107. prefix := strings.Repeat("#", length) + " "
  108. return prefix + strings.Join(children, "")
  109. }
  110. func (parser *HTMLParser) blockquoteToString(node *html.Node, stripLinebreak bool) string {
  111. str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
  112. childrenArr := strings.Split(strings.TrimSpace(str), "\n")
  113. for index, child := range childrenArr {
  114. childrenArr[index] = "> " + child
  115. }
  116. return strings.Join(childrenArr, "\n")
  117. }
  118. func (parser *HTMLParser) linkToString(node *html.Node, stripLinebreak bool) string {
  119. str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
  120. href := parser.getAttribute(node, "href")
  121. if len(href) == 0 {
  122. return str
  123. }
  124. match := MatrixToURL.FindStringSubmatch(href)
  125. if len(match) == 2 || len(match) == 3 {
  126. if parser.PillConverter != nil {
  127. mxid := match[1]
  128. eventID := ""
  129. if len(match) == 3 {
  130. eventID = match[2]
  131. }
  132. return parser.PillConverter(mxid, eventID)
  133. }
  134. return str
  135. }
  136. return fmt.Sprintf("%s (%s)", str, href)
  137. }
  138. func (parser *HTMLParser) tagToString(node *html.Node, stripLinebreak bool) string {
  139. switch node.Data {
  140. case "blockquote":
  141. return parser.blockquoteToString(node, stripLinebreak)
  142. case "ol", "ul":
  143. return parser.listToString(node, stripLinebreak)
  144. case "h1", "h2", "h3", "h4", "h5", "h6":
  145. return parser.headerToString(node, stripLinebreak)
  146. case "br":
  147. return parser.Newline
  148. case "b", "strong", "i", "em", "s", "del", "u", "ins", "tt", "code":
  149. return parser.basicFormatToString(node, stripLinebreak)
  150. case "a":
  151. return parser.linkToString(node, stripLinebreak)
  152. case "p":
  153. return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) + "\n"
  154. case "pre":
  155. var preStr string
  156. if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
  157. preStr = parser.nodeToString(node.FirstChild.FirstChild, false)
  158. } else {
  159. preStr = parser.nodeToString(node.FirstChild, false)
  160. }
  161. if parser.MonospaceBlockConverter != nil {
  162. return parser.MonospaceBlockConverter(preStr)
  163. }
  164. return preStr
  165. default:
  166. return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
  167. }
  168. }
  169. func (parser *HTMLParser) singleNodeToString(node *html.Node, stripLinebreak bool) TaggedString {
  170. switch node.Type {
  171. case html.TextNode:
  172. if stripLinebreak {
  173. node.Data = strings.Replace(node.Data, "\n", "", -1)
  174. }
  175. return TaggedString{node.Data, "text"}
  176. case html.ElementNode:
  177. return TaggedString{parser.tagToString(node, stripLinebreak), node.Data}
  178. case html.DocumentNode:
  179. return TaggedString{parser.nodeToTagAwareString(node.FirstChild, stripLinebreak), "html"}
  180. default:
  181. return TaggedString{"", "unknown"}
  182. }
  183. }
  184. func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, stripLinebreak bool) (strs []TaggedString) {
  185. for ; node != nil; node = node.NextSibling {
  186. strs = append(strs, parser.singleNodeToString(node, stripLinebreak))
  187. }
  188. return
  189. }
  190. var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
  191. func (parser *HTMLParser) isBlockTag(tag string) bool {
  192. for _, blockTag := range BlockTags {
  193. if tag == blockTag {
  194. return true
  195. }
  196. }
  197. return false
  198. }
  199. func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, stripLinebreak bool) string {
  200. strs := parser.nodeToTaggedStrings(node, stripLinebreak)
  201. var output strings.Builder
  202. for _, str := range strs {
  203. tstr := str.string
  204. if parser.isBlockTag(str.tag) {
  205. tstr = fmt.Sprintf("\n%s\n", tstr)
  206. }
  207. output.WriteString(tstr)
  208. }
  209. return strings.TrimSpace(output.String())
  210. }
  211. func (parser *HTMLParser) nodeToStrings(node *html.Node, stripLinebreak bool) (strs []string) {
  212. for ; node != nil; node = node.NextSibling {
  213. strs = append(strs, parser.singleNodeToString(node, stripLinebreak).string)
  214. }
  215. return
  216. }
  217. func (parser *HTMLParser) nodeToString(node *html.Node, stripLinebreak bool) string {
  218. return strings.Join(parser.nodeToStrings(node, stripLinebreak), "")
  219. }
  220. func (parser *HTMLParser) Parse(htmlData string) string {
  221. if parser.TabsToSpaces >= 0 {
  222. htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", parser.TabsToSpaces), -1)
  223. }
  224. node, _ := html.Parse(strings.NewReader(htmlData))
  225. return parser.nodeToTagAwareString(node, true)
  226. }
  227. func HTMLToText(html string) string {
  228. return (&HTMLParser{
  229. TabsToSpaces: 4,
  230. Newline: "\n",
  231. }).Parse(html)
  232. }