diff options
author | René 'Necoro' Neumann <necoro@necoro.eu> | 2020-04-26 00:23:49 +0200 |
---|---|---|
committer | René 'Necoro' Neumann <necoro@necoro.eu> | 2020-04-26 00:23:49 +0200 |
commit | a83e9bd36fc6a934553d93cbcc0edb120321f971 (patch) | |
tree | a9cc04ca3a0f8392f6ccbf470b323794e535ae7a | |
parent | 87fd0ad3e9309064fe9fda373035f5ce127ae8d8 (diff) | |
download | feed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.tar.gz feed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.tar.bz2 feed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.zip |
Filtering from cache
Diffstat (limited to '')
-rw-r--r-- | internal/feed/cache.go | 2 | ||||
-rw-r--r-- | internal/feed/cache_v1.go | 155 | ||||
-rw-r--r-- | internal/feed/feed.go | 14 | ||||
-rw-r--r-- | internal/feed/mail.go | 2 | ||||
-rw-r--r-- | internal/feed/parse.go | 2 | ||||
-rw-r--r-- | internal/feed/state.go | 39 | ||||
-rw-r--r-- | main.go | 5 | ||||
-rw-r--r-- | pkg/log/log.go | 4 | ||||
-rw-r--r-- | pkg/util/util.go | 10 |
9 files changed, 219 insertions, 14 deletions
diff --git a/internal/feed/cache.go b/internal/feed/cache.go index 735a7d6..2a51cb5 100644 --- a/internal/feed/cache.go +++ b/internal/feed/cache.go @@ -27,6 +27,8 @@ type CachedFeed interface { Checked(withFailure bool) Failures() uint Last() time.Time + filterItems([]feeditem) []feeditem + Commit() } func cacheForVersion(version Version) (Cache, error) { diff --git a/internal/feed/cache_v1.go b/internal/feed/cache_v1.go index 1c53239..a8e40ed 100644 --- a/internal/feed/cache_v1.go +++ b/internal/feed/cache_v1.go @@ -2,9 +2,11 @@ package feed import ( "crypto/sha256" + "fmt" "time" "github.com/Necoro/feed2imap-go/pkg/log" + "github.com/Necoro/feed2imap-go/pkg/util" ) const ( @@ -21,25 +23,37 @@ type v1Cache struct { } type cachedFeed struct { - LastCheck time.Time - NumFailures uint // can't be named `Failures` b/c it'll collide with the interface - Items []cachedItem + LastCheck time.Time + currentCheck time.Time + NumFailures uint // can't be named `Failures` b/c it'll collide with the interface + Items []cachedItem + newItems []cachedItem } type itemHash [sha256.Size]byte type cachedItem struct { - Uid string - Title string - Link string - Date time.Time - Updated time.Time - Creator string - Hash itemHash + Guid string + Title string + Link string + PublishedDate time.Time + UpdatedDate time.Time + UpdatedCache time.Time + Hash itemHash +} + +func (item cachedItem) String() string { + return fmt.Sprintf(`{ + Title: %q + Guid: %q + Link: %q + Published: %s + Updated: %s +}`, item.Title, item.Guid, item.Link, util.TimeFormat(item.PublishedDate), util.TimeFormat(item.UpdatedDate)) } func (cf *cachedFeed) Checked(withFailure bool) { - cf.LastCheck = time.Now() + cf.currentCheck = time.Now() if withFailure { cf.NumFailures++ } else { @@ -47,6 +61,12 @@ func (cf *cachedFeed) Checked(withFailure bool) { } } +func (cf *cachedFeed) Commit() { + cf.Items = cf.newItems + cf.newItems = nil + cf.LastCheck = cf.currentCheck +} + func (cf *cachedFeed) Failures() uint { return cf.NumFailures } @@ -118,3 +138,116 @@ func (cache *v1Cache) findItem(feed *Feed) CachedFeed { feed.cached = item return item } + +func newCachedItem(item feeditem) cachedItem { + var ci cachedItem + + ci.Title = item.Item.Title + ci.Link = item.Item.Link + if item.Item.PublishedParsed != nil { + ci.PublishedDate = *item.Item.PublishedParsed + } + if item.Item.UpdatedParsed != nil && !item.Item.UpdatedParsed.Equal(ci.PublishedDate) { + ci.UpdatedDate = *item.Item.UpdatedParsed + } + ci.Guid = item.Item.GUID + + contentByte := []byte(item.Item.Description + item.Item.Content) + ci.Hash = sha256.Sum256(contentByte) + + return ci +} + +func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool { + return other.Title == item.Title || + other.Link == item.Link || + other.PublishedDate.Equal(item.PublishedDate) || + (!ignoreHash && other.Hash == item.Hash) +} + +func (cf *cachedFeed) deleteItem(index int) { + copy(cf.Items[index:], cf.Items[index+1:]) + cf.Items[len(cf.Items)-1] = cachedItem{} + cf.Items = cf.Items[:len(cf.Items)-1] +} + +func (cf *cachedFeed) filterItems(items []feeditem) []feeditem { + if len(items) == 0 { + return items + } + + cacheItems := make(map[cachedItem]*feeditem, len(items)) + for idx := range items { + // remove complete duplicates on the go + cacheItems[newCachedItem(items[idx])] = &items[idx] + } + log.Debugf("%d items after deduplication", len(cacheItems)) + + filtered := make([]feeditem, 0, len(items)) + cacheadd := make([]cachedItem, 0, len(items)) + app := func(item *feeditem, ci cachedItem, oldIdx *int) { + if oldIdx != nil { + item.updateOnly = true + cf.deleteItem(*oldIdx) + } + filtered = append(filtered, *item) + cacheadd = append(cacheadd, ci) + } + +CACHE_ITEMS: + for ci, item := range cacheItems { + log.Debugf("Now checking %s", ci) + if cf.LastCheck.IsZero() || ci.PublishedDate.After(cf.LastCheck) { + log.Debug("Newer than last check, including.") + + item.addReason("newer") + app(item, ci, nil) + continue + } + + if ci.Guid != "" { + for idx, oldItem := range cf.Items { + if oldItem.Guid == ci.Guid { + log.Debugf("Guid matches with: %s", oldItem) + if !oldItem.similarTo(&ci, false) { + item.addReason("guid (upd)") + app(item, ci, &idx) + } else { + log.Debugf("Similar, ignoring") + } + + continue CACHE_ITEMS + } + } + + log.Debug("Found no matching GUID, including.") + item.addReason("guid") + app(item, ci, nil) + continue + } + + for idx, oldItem := range cf.Items { + if oldItem.similarTo(&ci, false) { + log.Debugf("Similarity matches, ignoring: %s", oldItem) + continue CACHE_ITEMS + } + + if oldItem.Link == ci.Link { + log.Debugf("Link matches, updating: %s", oldItem) + item.addReason("link (upd)") + app(item, ci, &idx) + + continue CACHE_ITEMS + } + } + + log.Debugf("No match found, inserting.") + app(item, ci, nil) + } + + log.Debugf("%d items after filtering", len(filtered)) + + cf.newItems = append(cacheadd, cf.Items...) + + return filtered +} diff --git a/internal/feed/feed.go b/internal/feed/feed.go index 6b95f6e..433d080 100644 --- a/internal/feed/feed.go +++ b/internal/feed/feed.go @@ -24,6 +24,8 @@ type feedDescriptor struct { type feeditem struct { *gofeed.Feed *gofeed.Item + updateOnly bool + reasons []string } func (item feeditem) Creator() string { @@ -33,6 +35,10 @@ func (item feeditem) Creator() string { return "" } +func (item feeditem) addReason(reason string) { + item.reasons = append(item.reasons, reason) +} + func (feed *Feed) descriptor() feedDescriptor { return feedDescriptor{ Name: feed.Name, @@ -51,6 +57,12 @@ func (feed *Feed) NeedsUpdate(updateTime time.Time) bool { return true } -func (feed *Feed) Success() bool { +func (feed *Feed) FetchSuccessful() bool { return feed.feed != nil } + +func (feed *Feed) MarkSuccess() { + if feed.cached != nil { + feed.cached.Commit() + } +} diff --git a/internal/feed/mail.go b/internal/feed/mail.go index d07f8cf..620b444 100644 --- a/internal/feed/mail.go +++ b/internal/feed/mail.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "strings" "time" "github.com/emersion/go-message/mail" @@ -40,6 +41,7 @@ func writeToBuffer(b *bytes.Buffer, feed *Feed, item feeditem, cfg *config.Confi h.SetAddressList("From", fromAdress(feed, item, cfg)) h.SetAddressList("To", address(feed.Name, cfg.DefaultEmail)) h.Add("X-Feed2Imap-Version", config.Version()) + h.Add("X-Feed2Imap-Reason", strings.Join(item.reasons, ",")) { // date date := item.Item.PublishedParsed diff --git a/internal/feed/parse.go b/internal/feed/parse.go index afca971..d7b20ad 100644 --- a/internal/feed/parse.go +++ b/internal/feed/parse.go @@ -27,7 +27,7 @@ func parseFeed(feed *Feed) error { feed.feed = parsedFeed feed.items = make([]feeditem, len(parsedFeed.Items)) for idx, item := range parsedFeed.Items { - feed.items[idx] = feeditem{parsedFeed, item} + feed.items[idx] = feeditem{Feed: parsedFeed, Item: item} } return nil } diff --git a/internal/feed/state.go b/internal/feed/state.go index 2a0a1e1..8ec8dfd 100644 --- a/internal/feed/state.go +++ b/internal/feed/state.go @@ -4,6 +4,7 @@ import ( "sync" "github.com/Necoro/feed2imap-go/pkg/config" + "github.com/Necoro/feed2imap-go/pkg/log" ) type State struct { @@ -50,7 +51,7 @@ func (state *State) Fetch() int { ctr := 0 for _, feed := range state.feeds { - success := feed.Success() + success := feed.FetchSuccessful() feed.cached.Checked(!success) if success { @@ -61,6 +62,42 @@ func (state *State) Fetch() int { return ctr } +func filterFeed(feed *Feed, group *sync.WaitGroup) { + if len(feed.items) > 0 { + origLen := len(feed.items) + + log.Debugf("Filtering %s. Starting with %d items", feed.Name, origLen) + items := feed.cached.filterItems(feed.items) + feed.items = items + + newLen := len(feed.items) + if newLen < origLen { + log.Printf("Filtered %s. Reduced from %d to %d items.", feed.Name, origLen, newLen) + } else { + log.Printf("Filtered %s, no reduction.", feed.Name) + } + + } else { + log.Debugf("No items for %s. No filtering.", feed.Name) + } + + if group != nil { + // group is nil in debug case + group.Done() + } +} + +func (state *State) Filter() { + if log.IsDebug() { + // single threaded for better output + state.Foreach(func(f *Feed) { + filterFeed(f, nil) + }) + } else { + state.ForeachGo(filterFeed) + } +} + func NewState(cfg *config.Config) *State { state := State{ feeds: map[string]*Feed{}, @@ -28,6 +28,7 @@ func processFeed(feed *feed.Feed, cfg *config.Config, client *imap.Client, wg *s } if len(mails) == 0 { + feed.MarkSuccess() return } @@ -43,6 +44,8 @@ func processFeed(feed *feed.Feed, cfg *config.Config, client *imap.Client, wg *s } log.Printf("Uploaded %d messages to '%s' @ %s", len(mails), feed.Name, folder) + + feed.MarkSuccess() } func run() error { @@ -82,6 +85,8 @@ func run() error { return fmt.Errorf("No successfull feed fetch.") } + state.Filter() + imapUrl, err := url.Parse(cfg.Target) if err != nil { return fmt.Errorf("parsing 'target': %w", err) diff --git a/pkg/log/log.go b/pkg/log/log.go index 4c69c1c..1e419e3 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -29,6 +29,10 @@ func SetDebug() { level = debug } +func IsDebug() bool { + return level == debug +} + func Debug(v ...interface{}) { if level <= debug { _ = debugLogger.Output(2, fmt.Sprint(v...)) diff --git a/pkg/util/util.go b/pkg/util/util.go index c5472ab..659c886 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -1,5 +1,7 @@ package util +import "time" + func StrContains(haystack []string, needle string) bool { for _, s := range haystack { if s == needle { @@ -9,3 +11,11 @@ func StrContains(haystack []string, needle string) bool { return false } + +func TimeFormat(t time.Time) string { + if t.IsZero() { + return "not set" + } else { + return t.Format(time.ANSIC) + } +} |