From ea115e8bb1145e20ec25d8e1954427244c250a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sat, 8 Jan 2022 21:00:01 +0100 Subject: #67: Readability support --- go.mod | 3 +++ go.sum | 10 ++++++++++ internal/feed/mail.go | 53 ++++++++++++++++++++++++++++++++++++++++++--------- pkg/config/body.go | 2 +- 4 files changed, 58 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 4e6ee59..20f6f94 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/Necoro/feed2imap-go go 1.17 require ( + github.com/Necoro/go-readability v0.0.0-20220107222023-364d914a66d0 github.com/Necoro/gofeed v1.1.4-0.20211029114605-b1a032d3e32f github.com/Necoro/html2text v0.0.0-20211029113451-0e111ca632ef github.com/PuerkitoBio/goquery v1.8.0 @@ -22,6 +23,8 @@ require ( github.com/andybalholm/cascadia v1.3.1 // indirect github.com/emersion/go-sasl v0.0.0-20200509203442-7bfe0ed36a21 // indirect github.com/emersion/go-textwrapper v0.0.0-20200911093747-65d896831594 // indirect + github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect + github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28 // indirect github.com/json-iterator/go v1.1.10 // indirect github.com/mattn/go-runewidth v0.0.9 // indirect github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf // indirect diff --git a/go.sum b/go.sum index 9d72ed9..35c21da 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,13 @@ github.com/DATA-DOG/go-sqlmock v1.3.3/go.mod h1:f/Ixk793poVmq4qj/V1dPUg2JEAKC73Q5eFN3EC/SaM= +github.com/Necoro/go-readability v0.0.0-20220107222023-364d914a66d0 h1:4tPedWfYpstnlPsXmuMeOJq11a+Ws2tuD+l2RyrSrqA= +github.com/Necoro/go-readability v0.0.0-20220107222023-364d914a66d0/go.mod h1:nk2AaJ7eR19njLHFOnOdTEyrRQJrQLpMjOzcO8LY+bc= github.com/Necoro/gofeed v1.1.4-0.20211029114605-b1a032d3e32f h1:4U61yP/+eEhN03KS4vs9nZb1BIof32BTexlDhumniF8= github.com/Necoro/gofeed v1.1.4-0.20211029114605-b1a032d3e32f/go.mod h1:O4tTFVp3PQj8ZXBtP8BEdN3+FAi0OTFBkNhJ01jHxVU= github.com/Necoro/html2text v0.0.0-20211029113451-0e111ca632ef h1:ug4tZhYWJ2kio0apzEUzuCdFGJw2eIjNm/+Je2DvlE4= github.com/Necoro/html2text v0.0.0-20211029113451-0e111ca632ef/go.mod h1:ZuQPJl0H5qfZKIQp5L5Kun61DZa/XwJR0WQnsaaR1NQ= github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/antonmedv/expr v1.9.0 h1:j4HI3NHEdgDnN9p6oI6Ndr0G5QryMY0FNxT4ONrFDGU= @@ -27,6 +30,10 @@ github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0Fn github.com/gabriel-vasile/mimetype v1.4.0/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8= github.com/gdamore/encoding v1.0.0/go.mod h1:alR0ol34c49FCSBLjhosxzcPHQbf2trDkoo5dl+VrEg= github.com/gdamore/tcell v1.3.0/go.mod h1:Hjvr+Ofd+gLglo7RYKxxnzCBmev3BzsS67MebKS4zMM= +github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+AgqxWeHwbbLJQeidq20hgfP+aMNWI= +github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE= +github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28 h1:gBeyun7mySAKWg7Fb0GOcv0upX9bdaZScs8QcRo8mEY= +github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -56,6 +63,7 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/rivo/tview v0.0.0-20200219210816-cd38d7432498/go.mod h1:6lkG1x+13OShEf0EaOCaTQYyB7d5nSbb181KtjlS+84= github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -63,7 +71,9 @@ github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRci github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20210505214959-0714010a04ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/sys v0.0.0-20190626150813-e07cf5db2756/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/internal/feed/mail.go b/internal/feed/mail.go index 444f76a..ad669ae 100644 --- a/internal/feed/mail.go +++ b/internal/feed/mail.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/Necoro/go-readability" "github.com/Necoro/gofeed" "github.com/Necoro/html2text" "github.com/PuerkitoBio/goquery" @@ -18,6 +19,7 @@ import ( "github.com/emersion/go-message/mail" "github.com/gabriel-vasile/mimetype" "golang.org/x/net/html" + "golang.org/x/net/html/charset" "github.com/Necoro/feed2imap-go/internal/feed/template" "github.com/Necoro/feed2imap-go/internal/http" @@ -248,25 +250,53 @@ func getImage(src string, ctx http.Context) ([]byte, string, error) { return img, mimeStr, nil } +func getFullArticle(src string, ctx http.Context) (string, error) { + log.Debugf("Fetching article from '%s'", src) + resp, cancel, err := http.Get(src, ctx) + if err != nil { + return "", fmt.Errorf("fetching from '%s': %w", src, err) + } + defer cancel() + + reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type")) + if err != nil { + return "", fmt.Errorf("detecting charset from '%s': %w", src, err) + } + + doc, err := html.Parse(reader) + if err != nil { + return "", fmt.Errorf("parsing body from '%s': %w", src, err) + } + + article, err := readability.FromDocument(doc, resp.Request.URL) + if err != nil { + return "", fmt.Errorf("parsing body from '%s': %w", src, err) + } + + return article.Content, nil +} + func cidNr(idx int) string { return fmt.Sprintf("cid_%d", idx) } -func getBody(content, description string, bodyCfg config.Body) string { +func (item *Item) getBody(bodyCfg config.Body) (string, error) { switch bodyCfg { case "default": - if content != "" { - return content + if item.Content != "" { + return item.Content, nil } - return description + return item.Description, nil case "description": - return description + return item.Description, nil case "content": - return content + return item.Content, nil case "both": - return description + content + return item.Description + item.Content, nil + case "fetch": + return getFullArticle(item.Link, item.feed.Context()) default: - panic(fmt.Sprintf("Unknown value for Body: %v", bodyCfg)) + return "", fmt.Errorf("Unknown value for Body: %v", bodyCfg) } } @@ -318,9 +348,14 @@ func (item *Item) downloadImage(src string) string { } func (item *Item) buildBody() { + var err error feed := item.feed - item.Body = getBody(item.Content, item.Description, feed.Body) + if item.Body, err = item.getBody(feed.Body); err != nil { + log.Errorf("Feed %s: Item %s: Error while fetching body: %s", feed.Name, item.Link, err) + return + } + bodyNode, err := html.Parse(strings.NewReader(item.Body)) if err != nil { log.Errorf("Feed %s: Item %s: Error while parsing html: %s", feed.Name, item.Link, err) diff --git a/pkg/config/body.go b/pkg/config/body.go index d9957d5..3b2f676 100644 --- a/pkg/config/body.go +++ b/pkg/config/body.go @@ -10,7 +10,7 @@ import ( type Body string -var validBody = []string{"default", "both", "content", "description"} +var validBody = []string{"default", "both", "content", "description", "fetch"} func (b *Body) UnmarshalYAML(node *yaml.Node) error { var val string -- cgit v1.2.3