From a83e9bd36fc6a934553d93cbcc0edb120321f971 Mon Sep 17 00:00:00 2001 From: René 'Necoro' Neumann Date: Sun, 26 Apr 2020 00:23:49 +0200 Subject: Filtering from cache --- internal/feed/cache_v1.go | 155 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 144 insertions(+), 11 deletions(-) (limited to 'internal/feed/cache_v1.go') diff --git a/internal/feed/cache_v1.go b/internal/feed/cache_v1.go index 1c53239..a8e40ed 100644 --- a/internal/feed/cache_v1.go +++ b/internal/feed/cache_v1.go @@ -2,9 +2,11 @@ package feed import ( "crypto/sha256" + "fmt" "time" "github.com/Necoro/feed2imap-go/pkg/log" + "github.com/Necoro/feed2imap-go/pkg/util" ) const ( @@ -21,25 +23,37 @@ type v1Cache struct { } type cachedFeed struct { - LastCheck time.Time - NumFailures uint // can't be named `Failures` b/c it'll collide with the interface - Items []cachedItem + LastCheck time.Time + currentCheck time.Time + NumFailures uint // can't be named `Failures` b/c it'll collide with the interface + Items []cachedItem + newItems []cachedItem } type itemHash [sha256.Size]byte type cachedItem struct { - Uid string - Title string - Link string - Date time.Time - Updated time.Time - Creator string - Hash itemHash + Guid string + Title string + Link string + PublishedDate time.Time + UpdatedDate time.Time + UpdatedCache time.Time + Hash itemHash +} + +func (item cachedItem) String() string { + return fmt.Sprintf(`{ + Title: %q + Guid: %q + Link: %q + Published: %s + Updated: %s +}`, item.Title, item.Guid, item.Link, util.TimeFormat(item.PublishedDate), util.TimeFormat(item.UpdatedDate)) } func (cf *cachedFeed) Checked(withFailure bool) { - cf.LastCheck = time.Now() + cf.currentCheck = time.Now() if withFailure { cf.NumFailures++ } else { @@ -47,6 +61,12 @@ func (cf *cachedFeed) Checked(withFailure bool) { } } +func (cf *cachedFeed) Commit() { + cf.Items = cf.newItems + cf.newItems = nil + cf.LastCheck = cf.currentCheck +} + func (cf *cachedFeed) Failures() uint { return cf.NumFailures } @@ -118,3 +138,116 @@ func (cache *v1Cache) findItem(feed *Feed) CachedFeed { feed.cached = item return item } + +func newCachedItem(item feeditem) cachedItem { + var ci cachedItem + + ci.Title = item.Item.Title + ci.Link = item.Item.Link + if item.Item.PublishedParsed != nil { + ci.PublishedDate = *item.Item.PublishedParsed + } + if item.Item.UpdatedParsed != nil && !item.Item.UpdatedParsed.Equal(ci.PublishedDate) { + ci.UpdatedDate = *item.Item.UpdatedParsed + } + ci.Guid = item.Item.GUID + + contentByte := []byte(item.Item.Description + item.Item.Content) + ci.Hash = sha256.Sum256(contentByte) + + return ci +} + +func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool { + return other.Title == item.Title || + other.Link == item.Link || + other.PublishedDate.Equal(item.PublishedDate) || + (!ignoreHash && other.Hash == item.Hash) +} + +func (cf *cachedFeed) deleteItem(index int) { + copy(cf.Items[index:], cf.Items[index+1:]) + cf.Items[len(cf.Items)-1] = cachedItem{} + cf.Items = cf.Items[:len(cf.Items)-1] +} + +func (cf *cachedFeed) filterItems(items []feeditem) []feeditem { + if len(items) == 0 { + return items + } + + cacheItems := make(map[cachedItem]*feeditem, len(items)) + for idx := range items { + // remove complete duplicates on the go + cacheItems[newCachedItem(items[idx])] = &items[idx] + } + log.Debugf("%d items after deduplication", len(cacheItems)) + + filtered := make([]feeditem, 0, len(items)) + cacheadd := make([]cachedItem, 0, len(items)) + app := func(item *feeditem, ci cachedItem, oldIdx *int) { + if oldIdx != nil { + item.updateOnly = true + cf.deleteItem(*oldIdx) + } + filtered = append(filtered, *item) + cacheadd = append(cacheadd, ci) + } + +CACHE_ITEMS: + for ci, item := range cacheItems { + log.Debugf("Now checking %s", ci) + if cf.LastCheck.IsZero() || ci.PublishedDate.After(cf.LastCheck) { + log.Debug("Newer than last check, including.") + + item.addReason("newer") + app(item, ci, nil) + continue + } + + if ci.Guid != "" { + for idx, oldItem := range cf.Items { + if oldItem.Guid == ci.Guid { + log.Debugf("Guid matches with: %s", oldItem) + if !oldItem.similarTo(&ci, false) { + item.addReason("guid (upd)") + app(item, ci, &idx) + } else { + log.Debugf("Similar, ignoring") + } + + continue CACHE_ITEMS + } + } + + log.Debug("Found no matching GUID, including.") + item.addReason("guid") + app(item, ci, nil) + continue + } + + for idx, oldItem := range cf.Items { + if oldItem.similarTo(&ci, false) { + log.Debugf("Similarity matches, ignoring: %s", oldItem) + continue CACHE_ITEMS + } + + if oldItem.Link == ci.Link { + log.Debugf("Link matches, updating: %s", oldItem) + item.addReason("link (upd)") + app(item, ci, &idx) + + continue CACHE_ITEMS + } + } + + log.Debugf("No match found, inserting.") + app(item, ci, nil) + } + + log.Debugf("%d items after filtering", len(filtered)) + + cf.newItems = append(cacheadd, cf.Items...) + + return filtered +} -- cgit v1.2.3-54-g00ecf