aboutsummaryrefslogtreecommitdiff
path: root/internal/feed/cache_v1.go
diff options
context:
space:
mode:
authorRené 'Necoro' Neumann <necoro@necoro.eu>2020-04-26 00:23:49 +0200
committerRené 'Necoro' Neumann <necoro@necoro.eu>2020-04-26 00:23:49 +0200
commita83e9bd36fc6a934553d93cbcc0edb120321f971 (patch)
treea9cc04ca3a0f8392f6ccbf470b323794e535ae7a /internal/feed/cache_v1.go
parent87fd0ad3e9309064fe9fda373035f5ce127ae8d8 (diff)
downloadfeed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.tar.gz
feed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.tar.bz2
feed2imap-go-a83e9bd36fc6a934553d93cbcc0edb120321f971.zip
Filtering from cache
Diffstat (limited to 'internal/feed/cache_v1.go')
-rw-r--r--internal/feed/cache_v1.go155
1 files changed, 144 insertions, 11 deletions
diff --git a/internal/feed/cache_v1.go b/internal/feed/cache_v1.go
index 1c53239..a8e40ed 100644
--- a/internal/feed/cache_v1.go
+++ b/internal/feed/cache_v1.go
@@ -2,9 +2,11 @@ package feed
import (
"crypto/sha256"
+ "fmt"
"time"
"github.com/Necoro/feed2imap-go/pkg/log"
+ "github.com/Necoro/feed2imap-go/pkg/util"
)
const (
@@ -21,25 +23,37 @@ type v1Cache struct {
}
type cachedFeed struct {
- LastCheck time.Time
- NumFailures uint // can't be named `Failures` b/c it'll collide with the interface
- Items []cachedItem
+ LastCheck time.Time
+ currentCheck time.Time
+ NumFailures uint // can't be named `Failures` b/c it'll collide with the interface
+ Items []cachedItem
+ newItems []cachedItem
}
type itemHash [sha256.Size]byte
type cachedItem struct {
- Uid string
- Title string
- Link string
- Date time.Time
- Updated time.Time
- Creator string
- Hash itemHash
+ Guid string
+ Title string
+ Link string
+ PublishedDate time.Time
+ UpdatedDate time.Time
+ UpdatedCache time.Time
+ Hash itemHash
+}
+
+// String renders the item as a multi-line, brace-delimited description for
+// debug output: Title, Guid and Link are printed quoted, the published and
+// updated dates are formatted through util.TimeFormat. UpdatedCache and Hash
+// are not part of the output.
+func (item cachedItem) String() string {
+	published := util.TimeFormat(item.PublishedDate)
+	updated := util.TimeFormat(item.UpdatedDate)
+
+	return fmt.Sprintf(`{
+ Title: %q
+ Guid: %q
+ Link: %q
+ Published: %s
+ Updated: %s
+}`, item.Title, item.Guid, item.Link, published, updated)
}
func (cf *cachedFeed) Checked(withFailure bool) {
- cf.LastCheck = time.Now()
+ cf.currentCheck = time.Now()
if withFailure {
cf.NumFailures++
} else {
@@ -47,6 +61,12 @@ func (cf *cachedFeed) Checked(withFailure bool) {
}
}
+// Commit makes the outcome of the current check permanent: the item list
+// prepared by filterItems becomes the cached list and the time recorded by
+// Checked becomes LastCheck.
+func (cf *cachedFeed) Commit() {
+	// filterItems always stores a non-nil slice in newItems once it has
+	// actually filtered something, but returns early (leaving newItems
+	// untouched) when it is handed no items. A nil newItems therefore means
+	// "no new state was computed" and must not wipe the existing cache.
+	if cf.newItems != nil {
+		cf.Items = cf.newItems
+		cf.newItems = nil
+	}
+	cf.LastCheck = cf.currentCheck
+}
+
// Failures returns the feed's NumFailures counter, i.e. the value that
// Checked increments whenever it is called with withFailure set.
func (cf *cachedFeed) Failures() uint {
return cf.NumFailures
}
@@ -118,3 +138,116 @@ func (cache *v1Cache) findItem(feed *Feed) CachedFeed {
feed.cached = item
return item
}
+
+func newCachedItem(item feeditem) cachedItem {
+ var ci cachedItem
+
+ ci.Title = item.Item.Title
+ ci.Link = item.Item.Link
+ if item.Item.PublishedParsed != nil {
+ ci.PublishedDate = *item.Item.PublishedParsed
+ }
+ if item.Item.UpdatedParsed != nil && !item.Item.UpdatedParsed.Equal(ci.PublishedDate) {
+ ci.UpdatedDate = *item.Item.UpdatedParsed
+ }
+ ci.Guid = item.Item.GUID
+
+ contentByte := []byte(item.Item.Description + item.Item.Content)
+ ci.Hash = sha256.Sum256(contentByte)
+
+ return ci
+}
+
+// similarTo reports whether other carries the same data as item: equal
+// title, equal link, equal publication time and, unless ignoreHash is set,
+// an equal content hash.
+//
+// All criteria have to hold at once. Treating a single matching field as
+// "similar" would let an entry whose content changed but whose title stayed
+// the same pass as unchanged, and would make a separate "same link, changed
+// content" check by the caller impossible, since an equal link alone would
+// already count as similar.
+func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool {
+	return other.Title == item.Title &&
+		other.Link == item.Link &&
+		other.PublishedDate.Equal(item.PublishedDate) &&
+		(ignoreHash || other.Hash == item.Hash)
+}
+
+// deleteItem removes the entry at position index from cf.Items while keeping
+// the order of the remaining entries. The slice is shortened in place, i.e.
+// the backing array is reused. An index outside of cf.Items panics.
+func (cf *cachedFeed) deleteItem(index int) {
+	last := len(cf.Items) - 1
+
+	// move everything behind index one slot forward, overwriting the victim
+	copy(cf.Items[index:], cf.Items[index+1:])
+
+	// the final slot now duplicates its predecessor; zero it before cutting
+	// it off so the spare capacity holds no stale item data
+	cf.Items[last] = cachedItem{}
+	cf.Items = cf.Items[:last]
+}
+
+func (cf *cachedFeed) filterItems(items []feeditem) []feeditem {
+ if len(items) == 0 {
+ return items
+ }
+
+ cacheItems := make(map[cachedItem]*feeditem, len(items))
+ for idx := range items {
+ // remove complete duplicates on the go
+ cacheItems[newCachedItem(items[idx])] = &items[idx]
+ }
+ log.Debugf("%d items after deduplication", len(cacheItems))
+
+ filtered := make([]feeditem, 0, len(items))
+ cacheadd := make([]cachedItem, 0, len(items))
+ app := func(item *feeditem, ci cachedItem, oldIdx *int) {
+ if oldIdx != nil {
+ item.updateOnly = true
+ cf.deleteItem(*oldIdx)
+ }
+ filtered = append(filtered, *item)
+ cacheadd = append(cacheadd, ci)
+ }
+
+CACHE_ITEMS:
+ for ci, item := range cacheItems {
+ log.Debugf("Now checking %s", ci)
+ if cf.LastCheck.IsZero() || ci.PublishedDate.After(cf.LastCheck) {
+ log.Debug("Newer than last check, including.")
+
+ item.addReason("newer")
+ app(item, ci, nil)
+ continue
+ }
+
+ if ci.Guid != "" {
+ for idx, oldItem := range cf.Items {
+ if oldItem.Guid == ci.Guid {
+ log.Debugf("Guid matches with: %s", oldItem)
+ if !oldItem.similarTo(&ci, false) {
+ item.addReason("guid (upd)")
+ app(item, ci, &idx)
+ } else {
+ log.Debugf("Similar, ignoring")
+ }
+
+ continue CACHE_ITEMS
+ }
+ }
+
+ log.Debug("Found no matching GUID, including.")
+ item.addReason("guid")
+ app(item, ci, nil)
+ continue
+ }
+
+ for idx, oldItem := range cf.Items {
+ if oldItem.similarTo(&ci, false) {
+ log.Debugf("Similarity matches, ignoring: %s", oldItem)
+ continue CACHE_ITEMS
+ }
+
+ if oldItem.Link == ci.Link {
+ log.Debugf("Link matches, updating: %s", oldItem)
+ item.addReason("link (upd)")
+ app(item, ci, &idx)
+
+ continue CACHE_ITEMS
+ }
+ }
+
+ log.Debugf("No match found, inserting.")
+ app(item, ci, nil)
+ }
+
+ log.Debugf("%d items after filtering", len(filtered))
+
+ cf.newItems = append(cacheadd, cf.Items...)
+
+ return filtered
+}