From a83e9bd36fc6a934553d93cbcc0edb120321f971 Mon Sep 17 00:00:00 2001 From: René 'Necoro' Neumann Date: Sun, 26 Apr 2020 00:23:49 +0200 Subject: Filtering from cache --- internal/feed/cache.go | 2 + internal/feed/cache_v1.go | 155 ++++++++++++++++++++++++++++++++++++++++++---- internal/feed/feed.go | 14 ++++- internal/feed/mail.go | 2 + internal/feed/parse.go | 2 +- internal/feed/state.go | 39 +++++++++++- 6 files changed, 200 insertions(+), 14 deletions(-) (limited to 'internal/feed') diff --git a/internal/feed/cache.go b/internal/feed/cache.go index 735a7d6..2a51cb5 100644 --- a/internal/feed/cache.go +++ b/internal/feed/cache.go @@ -27,6 +27,8 @@ type CachedFeed interface { Checked(withFailure bool) Failures() uint Last() time.Time + filterItems([]feeditem) []feeditem + Commit() } func cacheForVersion(version Version) (Cache, error) { diff --git a/internal/feed/cache_v1.go b/internal/feed/cache_v1.go index 1c53239..a8e40ed 100644 --- a/internal/feed/cache_v1.go +++ b/internal/feed/cache_v1.go @@ -2,9 +2,11 @@ package feed import ( "crypto/sha256" + "fmt" "time" "github.com/Necoro/feed2imap-go/pkg/log" + "github.com/Necoro/feed2imap-go/pkg/util" ) const ( @@ -21,25 +23,37 @@ type v1Cache struct { } type cachedFeed struct { - LastCheck time.Time - NumFailures uint // can't be named `Failures` b/c it'll collide with the interface - Items []cachedItem + LastCheck time.Time + currentCheck time.Time + NumFailures uint // can't be named `Failures` b/c it'll collide with the interface + Items []cachedItem + newItems []cachedItem } type itemHash [sha256.Size]byte type cachedItem struct { - Uid string - Title string - Link string - Date time.Time - Updated time.Time - Creator string - Hash itemHash + Guid string + Title string + Link string + PublishedDate time.Time + UpdatedDate time.Time + UpdatedCache time.Time + Hash itemHash +} + +func (item cachedItem) String() string { + return fmt.Sprintf(`{ + Title: %q + Guid: %q + Link: %q + Published: %s + Updated: %s +}`, item.Title, item.Guid, item.Link, util.TimeFormat(item.PublishedDate), util.TimeFormat(item.UpdatedDate)) } func (cf *cachedFeed) Checked(withFailure bool) { - cf.LastCheck = time.Now() + cf.currentCheck = time.Now() if withFailure { cf.NumFailures++ } else { @@ -47,6 +61,12 @@ func (cf *cachedFeed) Checked(withFailure bool) { } } +func (cf *cachedFeed) Commit() { + cf.Items = cf.newItems + cf.newItems = nil + cf.LastCheck = cf.currentCheck +} + func (cf *cachedFeed) Failures() uint { return cf.NumFailures } @@ -118,3 +138,116 @@ func (cache *v1Cache) findItem(feed *Feed) CachedFeed { feed.cached = item return item } + +func newCachedItem(item feeditem) cachedItem { + var ci cachedItem + + ci.Title = item.Item.Title + ci.Link = item.Item.Link + if item.Item.PublishedParsed != nil { + ci.PublishedDate = *item.Item.PublishedParsed + } + if item.Item.UpdatedParsed != nil && !item.Item.UpdatedParsed.Equal(ci.PublishedDate) { + ci.UpdatedDate = *item.Item.UpdatedParsed + } + ci.Guid = item.Item.GUID + + contentByte := []byte(item.Item.Description + item.Item.Content) + ci.Hash = sha256.Sum256(contentByte) + + return ci +} + +func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool { + return other.Title == item.Title || + other.Link == item.Link || + other.PublishedDate.Equal(item.PublishedDate) || + (!ignoreHash && other.Hash == item.Hash) +} + +func (cf *cachedFeed) deleteItem(index int) { + copy(cf.Items[index:], cf.Items[index+1:]) + cf.Items[len(cf.Items)-1] = cachedItem{} + cf.Items = cf.Items[:len(cf.Items)-1] +} + +func (cf *cachedFeed) filterItems(items []feeditem) []feeditem { + if len(items) == 0 { + return items + } + + cacheItems := make(map[cachedItem]*feeditem, len(items)) + for idx := range items { + // remove complete duplicates on the go + cacheItems[newCachedItem(items[idx])] = &items[idx] + } + log.Debugf("%d items after deduplication", len(cacheItems)) + + filtered := make([]feeditem, 0, len(items)) + cacheadd := make([]cachedItem, 0, len(items)) + app := func(item *feeditem, ci cachedItem, oldIdx *int) { + if oldIdx != nil { + item.updateOnly = true + cf.deleteItem(*oldIdx) + } + filtered = append(filtered, *item) + cacheadd = append(cacheadd, ci) + } + +CACHE_ITEMS: + for ci, item := range cacheItems { + log.Debugf("Now checking %s", ci) + if cf.LastCheck.IsZero() || ci.PublishedDate.After(cf.LastCheck) { + log.Debug("Newer than last check, including.") + + item.addReason("newer") + app(item, ci, nil) + continue + } + + if ci.Guid != "" { + for idx, oldItem := range cf.Items { + if oldItem.Guid == ci.Guid { + log.Debugf("Guid matches with: %s", oldItem) + if !oldItem.similarTo(&ci, false) { + item.addReason("guid (upd)") + app(item, ci, &idx) + } else { + log.Debugf("Similar, ignoring") + } + + continue CACHE_ITEMS + } + } + + log.Debug("Found no matching GUID, including.") + item.addReason("guid") + app(item, ci, nil) + continue + } + + for idx, oldItem := range cf.Items { + if oldItem.similarTo(&ci, false) { + log.Debugf("Similarity matches, ignoring: %s", oldItem) + continue CACHE_ITEMS + } + + if oldItem.Link == ci.Link { + log.Debugf("Link matches, updating: %s", oldItem) + item.addReason("link (upd)") + app(item, ci, &idx) + + continue CACHE_ITEMS + } + } + + log.Debugf("No match found, inserting.") + app(item, ci, nil) + } + + log.Debugf("%d items after filtering", len(filtered)) + + cf.newItems = append(cacheadd, cf.Items...) + + return filtered +} diff --git a/internal/feed/feed.go b/internal/feed/feed.go index 6b95f6e..433d080 100644 --- a/internal/feed/feed.go +++ b/internal/feed/feed.go @@ -24,6 +24,8 @@ type feedDescriptor struct { type feeditem struct { *gofeed.Feed *gofeed.Item + updateOnly bool + reasons []string } func (item feeditem) Creator() string { @@ -33,6 +35,10 @@ func (item feeditem) Creator() string { return "" } +func (item feeditem) addReason(reason string) { + item.reasons = append(item.reasons, reason) +} + func (feed *Feed) descriptor() feedDescriptor { return feedDescriptor{ Name: feed.Name, @@ -51,6 +57,12 @@ func (feed *Feed) NeedsUpdate(updateTime time.Time) bool { return true } -func (feed *Feed) Success() bool { +func (feed *Feed) FetchSuccessful() bool { return feed.feed != nil } + +func (feed *Feed) MarkSuccess() { + if feed.cached != nil { + feed.cached.Commit() + } +} diff --git a/internal/feed/mail.go b/internal/feed/mail.go index d07f8cf..620b444 100644 --- a/internal/feed/mail.go +++ b/internal/feed/mail.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "strings" "time" "github.com/emersion/go-message/mail" @@ -40,6 +41,7 @@ func writeToBuffer(b *bytes.Buffer, feed *Feed, item feeditem, cfg *config.Confi h.SetAddressList("From", fromAdress(feed, item, cfg)) h.SetAddressList("To", address(feed.Name, cfg.DefaultEmail)) h.Add("X-Feed2Imap-Version", config.Version()) + h.Add("X-Feed2Imap-Reason", strings.Join(item.reasons, ",")) { // date date := item.Item.PublishedParsed diff --git a/internal/feed/parse.go b/internal/feed/parse.go index afca971..d7b20ad 100644 --- a/internal/feed/parse.go +++ b/internal/feed/parse.go @@ -27,7 +27,7 @@ func parseFeed(feed *Feed) error { feed.feed = parsedFeed feed.items = make([]feeditem, len(parsedFeed.Items)) for idx, item := range parsedFeed.Items { - feed.items[idx] = feeditem{parsedFeed, item} + feed.items[idx] = feeditem{Feed: parsedFeed, Item: item} } return nil } diff --git a/internal/feed/state.go b/internal/feed/state.go index 2a0a1e1..8ec8dfd 100644 --- a/internal/feed/state.go +++ b/internal/feed/state.go @@ -4,6 +4,7 @@ import ( "sync" "github.com/Necoro/feed2imap-go/pkg/config" + "github.com/Necoro/feed2imap-go/pkg/log" ) type State struct { @@ -50,7 +51,7 @@ func (state *State) Fetch() int { ctr := 0 for _, feed := range state.feeds { - success := feed.Success() + success := feed.FetchSuccessful() feed.cached.Checked(!success) if success { @@ -61,6 +62,42 @@ func (state *State) Fetch() int { return ctr } +func filterFeed(feed *Feed, group *sync.WaitGroup) { + if len(feed.items) > 0 { + origLen := len(feed.items) + + log.Debugf("Filtering %s. Starting with %d items", feed.Name, origLen) + items := feed.cached.filterItems(feed.items) + feed.items = items + + newLen := len(feed.items) + if newLen < origLen { + log.Printf("Filtered %s. Reduced from %d to %d items.", feed.Name, origLen, newLen) + } else { + log.Printf("Filtered %s, no reduction.", feed.Name) + } + + } else { + log.Debugf("No items for %s. No filtering.", feed.Name) + } + + if group != nil { + // group is nil in debug case + group.Done() + } +} + +func (state *State) Filter() { + if log.IsDebug() { + // single threaded for better output + state.Foreach(func(f *Feed) { + filterFeed(f, nil) + }) + } else { + state.ForeachGo(filterFeed) + } +} + func NewState(cfg *config.Config) *State { state := State{ feeds: map[string]*Feed{}, -- cgit v1.2.3-70-g09d2