From cf3e619639a9e1fd182e0c2437a0619e3ee2ab7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sat, 27 Feb 2021 21:08:02 +0100 Subject: Rename file --- internal/feed/cache/cache_v1.go | 375 ---------------------------------------- internal/feed/cache/v1.go | 375 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 375 insertions(+), 375 deletions(-) delete mode 100644 internal/feed/cache/cache_v1.go create mode 100644 internal/feed/cache/v1.go diff --git a/internal/feed/cache/cache_v1.go b/internal/feed/cache/cache_v1.go deleted file mode 100644 index 7d95d4c..0000000 --- a/internal/feed/cache/cache_v1.go +++ /dev/null @@ -1,375 +0,0 @@ -package cache - -import ( - "crypto/sha256" - "encoding/base64" - "encoding/hex" - "fmt" - "sort" - "strconv" - "strings" - "time" - - "github.com/google/uuid" - - "github.com/Necoro/feed2imap-go/internal/feed" - "github.com/Necoro/feed2imap-go/pkg/log" - "github.com/Necoro/feed2imap-go/pkg/util" -) - -const ( - v1Version Version = 1 - startFeedId uint64 = 1 - maxCacheSize = 1000 -) - -type feedId uint64 - -func (id feedId) String() string { - return strconv.FormatUint(uint64(id), 16) -} - -func idFromString(s string) feedId { - id, _ := strconv.ParseUint(s, 16, 64) - return feedId(id) -} - -type v1Cache struct { - Ids map[feed.Descriptor]feedId - NextId uint64 - Feeds map[feedId]*cachedFeed -} - -type cachedFeed struct { - feed *feed.Feed - id feedId // not saved, has to be set on loading - LastCheck time.Time - currentCheck time.Time - NumFailures int // can't be named `Failures` b/c it'll collide with the interface - Items []cachedItem - newItems []cachedItem -} - -type itemHash [sha256.Size]byte - -func (h itemHash) String() string { - return hex.EncodeToString(h[:]) -} - -type cachedItem struct { - Guid string - Title string - Link string - Date time.Time - UpdatedCache time.Time - Hash itemHash - ID uuid.UUID - deleted bool -} - -func (item cachedItem) String() string { - return fmt.Sprintf(`{ - ID: %s - Title: %q - Guid: %q - Link: %q - Date: %s - Hash: %s -}`, - base64.RawURLEncoding.EncodeToString(item.ID[:]), - item.Title, item.Guid, item.Link, util.TimeFormat(item.Date), item.Hash) -} - -func (cf *cachedFeed) Checked(withFailure bool) { - cf.currentCheck = time.Now() - if withFailure { - cf.NumFailures++ - } else { - cf.NumFailures = 0 - } -} - -func (cf *cachedFeed) Commit() { - if cf.newItems != nil { - cf.Items = cf.newItems - cf.newItems = nil - } - cf.LastCheck = cf.currentCheck -} - -func (cf *cachedFeed) Failures() int { - return cf.NumFailures -} - -func (cf *cachedFeed) Last() time.Time { - return cf.LastCheck -} - -func (cf *cachedFeed) Feed() *feed.Feed { - return cf.feed -} - -func (cache *v1Cache) Version() Version { - return v1Version -} - -func (cache *v1Cache) Info() string { - descriptors := make([]feed.Descriptor, len(cache.Ids)) - i := 0 - for descr := range cache.Ids { - descriptors[i] = descr - i++ - } - - sort.Slice(descriptors, func(i, j int) bool { - return descriptors[i].Name < descriptors[j].Name - }) - - b := strings.Builder{} - for _, descr := range descriptors { - id := cache.Ids[descr] - feed := cache.Feeds[id] - b.WriteString(fmt.Sprintf("%3s: %s (%s) (%d items)\n", id.String(), descr.Name, descr.Url, len(feed.Items))) - } - return b.String() -} - -func (cache *v1Cache) SpecificInfo(i interface{}) string { - id := idFromString(i.(string)) - - b := strings.Builder{} - feed := cache.Feeds[id] - - for descr, fId := range cache.Ids { - if id == fId { - b.WriteString(descr.Name) - b.WriteString(" -- ") - b.WriteString(descr.Url) - b.WriteByte('\n') - break - } - } - - b.WriteString(fmt.Sprintf(` -Last Check: %s -Num Failures: %d -Num Items: %d -`, - util.TimeFormat(feed.LastCheck), - feed.NumFailures, - len(feed.Items))) - - for _, item := range feed.Items { - b.WriteString("\n--------------------\n") - b.WriteString(item.String()) - } - return b.String() -} - -func newV1Cache() *v1Cache { - cache := v1Cache{ - Ids: map[feed.Descriptor]feedId{}, - Feeds: map[feedId]*cachedFeed{}, - NextId: startFeedId, - } - return &cache -} - -func (cache *v1Cache) transformTo(v Version) (Impl, error) { - switch v { - case v1Version: - return cache, nil - default: - return nil, fmt.Errorf("Transformation not supported") - } -} - -func (cache *v1Cache) getItem(id feedId) *cachedFeed { - feed, ok := cache.Feeds[id] - if !ok { - feed = &cachedFeed{} - cache.Feeds[id] = feed - } - feed.id = id - return feed -} - -func (cache *v1Cache) cachedFeed(f *feed.Feed) CachedFeed { - fDescr := f.Descriptor() - id, ok := cache.Ids[fDescr] - if !ok { - var otherId feed.Descriptor - changed := false - for otherId, id = range cache.Ids { - if otherId.Name == fDescr.Name { - log.Warnf("Feed %s seems to have changed URLs: new '%s', old '%s'. Updating.", - fDescr.Name, fDescr.Url, otherId.Url) - changed = true - break - } else if otherId.Url == fDescr.Url { - log.Warnf("Feed with URL '%s' seems to have changed its name: new '%s', old '%s'. Updating.", - fDescr.Url, fDescr.Name, otherId.Name) - changed = true - break - } - } - if changed { - delete(cache.Ids, otherId) - } else { - id = feedId(cache.NextId) - cache.NextId++ - } - - cache.Ids[fDescr] = id - } - - cf := cache.getItem(id) - cf.feed = f - f.SetExtID(id) - return cf -} - -func (cf *cachedFeed) cachedItem(item *feed.Item) cachedItem { - var ci cachedItem - - ci.ID = item.ID - ci.Title = item.Item.Title - ci.Link = item.Item.Link - if item.DateParsed() != nil { - ci.Date = *item.DateParsed() - } - ci.Guid = item.Item.GUID - - contentByte := []byte(item.Item.Description + item.Item.Content) - ci.Hash = sha256.Sum256(contentByte) - - return ci -} - -func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool { - return other.Title == item.Title && - other.Link == item.Link && - other.Date.Equal(item.Date) && - (ignoreHash || other.Hash == item.Hash) -} - -func (cf *cachedFeed) markItemDeleted(index int) { - cf.Items[index].deleted = true -} - -func (cf *cachedFeed) Filter(items []feed.Item, ignoreHash, alwaysNew bool) []feed.Item { - if len(items) == 0 { - return items - } - - cacheItems := make(map[cachedItem]*feed.Item, len(items)) - for idx := range items { - i := &items[idx] - ci := cf.cachedItem(i) - - // remove complete duplicates on the go - cacheItems[ci] = i - } - log.Debugf("%d items after deduplication", len(cacheItems)) - - filtered := make([]feed.Item, 0, len(items)) - cacheadd := make([]cachedItem, 0, len(items)) - app := func(item *feed.Item, ci cachedItem, oldIdx *int) { - if oldIdx != nil { - item.UpdateOnly = true - prevId := cf.Items[*oldIdx].ID - ci.ID = prevId - item.ID = prevId - log.Debugf("oldIdx: %d, prevId: %s, item.id: %s", *oldIdx, prevId, item.Id()) - cf.markItemDeleted(*oldIdx) - } - filtered = append(filtered, *item) - cacheadd = append(cacheadd, ci) - } - - seen := func(oldIdx int) { - ci := cf.Items[oldIdx] - cf.markItemDeleted(oldIdx) - cacheadd = append(cacheadd, ci) - } - -CACHE_ITEMS: - for ci, item := range cacheItems { - log.Debugf("Now checking %s", ci) - - if ci.Guid != "" { - for idx, oldItem := range cf.Items { - if oldItem.Guid == ci.Guid { - log.Debugf("Guid matches with: %s", oldItem) - if !oldItem.similarTo(&ci, ignoreHash) { - item.AddReason("guid (upd)") - app(item, ci, &idx) - } else { - log.Debugf("Similar, ignoring item %s", base64.RawURLEncoding.EncodeToString(oldItem.ID[:])) - seen(idx) - } - - continue CACHE_ITEMS - } - } - - log.Debug("Found no matching GUID, including.") - item.AddReason("guid") - app(item, ci, nil) - continue - } - - for idx, oldItem := range cf.Items { - if oldItem.similarTo(&ci, ignoreHash) { - log.Debugf("Similarity matches, ignoring: %s", oldItem) - seen(idx) - continue CACHE_ITEMS - } - - if oldItem.Link == ci.Link { - if alwaysNew { - log.Debugf("Link matches, but `always-new`.") - item.AddReason("always-new") - continue - } - log.Debugf("Link matches, updating: %s", oldItem) - item.AddReason("link (upd)") - app(item, ci, &idx) - - continue CACHE_ITEMS - } - } - - log.Debugf("No match found, inserting.") - item.AddReason("new") - app(item, ci, nil) - } - - log.Debugf("%d items after filtering", len(filtered)) - - cf.newItems = append(cacheadd, filterItems(cf.Items)...) - - return filtered -} - -func filterItems(items []cachedItem) []cachedItem { - var n int - - if len(items) < maxCacheSize { - n = len(items) - } else { - n = maxCacheSize - } - - copiedItems := make([]cachedItem, 0, n) - for _, item := range items { - if !item.deleted { - copiedItems = append(copiedItems, item) - if len(copiedItems) >= n { - break - } - } - } - - return copiedItems -} diff --git a/internal/feed/cache/v1.go b/internal/feed/cache/v1.go new file mode 100644 index 0000000..7d95d4c --- /dev/null +++ b/internal/feed/cache/v1.go @@ -0,0 +1,375 @@ +package cache + +import ( + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "fmt" + "sort" + "strconv" + "strings" + "time" + + "github.com/google/uuid" + + "github.com/Necoro/feed2imap-go/internal/feed" + "github.com/Necoro/feed2imap-go/pkg/log" + "github.com/Necoro/feed2imap-go/pkg/util" +) + +const ( + v1Version Version = 1 + startFeedId uint64 = 1 + maxCacheSize = 1000 +) + +type feedId uint64 + +func (id feedId) String() string { + return strconv.FormatUint(uint64(id), 16) +} + +func idFromString(s string) feedId { + id, _ := strconv.ParseUint(s, 16, 64) + return feedId(id) +} + +type v1Cache struct { + Ids map[feed.Descriptor]feedId + NextId uint64 + Feeds map[feedId]*cachedFeed +} + +type cachedFeed struct { + feed *feed.Feed + id feedId // not saved, has to be set on loading + LastCheck time.Time + currentCheck time.Time + NumFailures int // can't be named `Failures` b/c it'll collide with the interface + Items []cachedItem + newItems []cachedItem +} + +type itemHash [sha256.Size]byte + +func (h itemHash) String() string { + return hex.EncodeToString(h[:]) +} + +type cachedItem struct { + Guid string + Title string + Link string + Date time.Time + UpdatedCache time.Time + Hash itemHash + ID uuid.UUID + deleted bool +} + +func (item cachedItem) String() string { + return fmt.Sprintf(`{ + ID: %s + Title: %q + Guid: %q + Link: %q + Date: %s + Hash: %s +}`, + base64.RawURLEncoding.EncodeToString(item.ID[:]), + item.Title, item.Guid, item.Link, util.TimeFormat(item.Date), item.Hash) +} + +func (cf *cachedFeed) Checked(withFailure bool) { + cf.currentCheck = time.Now() + if withFailure { + cf.NumFailures++ + } else { + cf.NumFailures = 0 + } +} + +func (cf *cachedFeed) Commit() { + if cf.newItems != nil { + cf.Items = cf.newItems + cf.newItems = nil + } + cf.LastCheck = cf.currentCheck +} + +func (cf *cachedFeed) Failures() int { + return cf.NumFailures +} + +func (cf *cachedFeed) Last() time.Time { + return cf.LastCheck +} + +func (cf *cachedFeed) Feed() *feed.Feed { + return cf.feed +} + +func (cache *v1Cache) Version() Version { + return v1Version +} + +func (cache *v1Cache) Info() string { + descriptors := make([]feed.Descriptor, len(cache.Ids)) + i := 0 + for descr := range cache.Ids { + descriptors[i] = descr + i++ + } + + sort.Slice(descriptors, func(i, j int) bool { + return descriptors[i].Name < descriptors[j].Name + }) + + b := strings.Builder{} + for _, descr := range descriptors { + id := cache.Ids[descr] + feed := cache.Feeds[id] + b.WriteString(fmt.Sprintf("%3s: %s (%s) (%d items)\n", id.String(), descr.Name, descr.Url, len(feed.Items))) + } + return b.String() +} + +func (cache *v1Cache) SpecificInfo(i interface{}) string { + id := idFromString(i.(string)) + + b := strings.Builder{} + feed := cache.Feeds[id] + + for descr, fId := range cache.Ids { + if id == fId { + b.WriteString(descr.Name) + b.WriteString(" -- ") + b.WriteString(descr.Url) + b.WriteByte('\n') + break + } + } + + b.WriteString(fmt.Sprintf(` +Last Check: %s +Num Failures: %d +Num Items: %d +`, + util.TimeFormat(feed.LastCheck), + feed.NumFailures, + len(feed.Items))) + + for _, item := range feed.Items { + b.WriteString("\n--------------------\n") + b.WriteString(item.String()) + } + return b.String() +} + +func newV1Cache() *v1Cache { + cache := v1Cache{ + Ids: map[feed.Descriptor]feedId{}, + Feeds: map[feedId]*cachedFeed{}, + NextId: startFeedId, + } + return &cache +} + +func (cache *v1Cache) transformTo(v Version) (Impl, error) { + switch v { + case v1Version: + return cache, nil + default: + return nil, fmt.Errorf("Transformation not supported") + } +} + +func (cache *v1Cache) getItem(id feedId) *cachedFeed { + feed, ok := cache.Feeds[id] + if !ok { + feed = &cachedFeed{} + cache.Feeds[id] = feed + } + feed.id = id + return feed +} + +func (cache *v1Cache) cachedFeed(f *feed.Feed) CachedFeed { + fDescr := f.Descriptor() + id, ok := cache.Ids[fDescr] + if !ok { + var otherId feed.Descriptor + changed := false + for otherId, id = range cache.Ids { + if otherId.Name == fDescr.Name { + log.Warnf("Feed %s seems to have changed URLs: new '%s', old '%s'. Updating.", + fDescr.Name, fDescr.Url, otherId.Url) + changed = true + break + } else if otherId.Url == fDescr.Url { + log.Warnf("Feed with URL '%s' seems to have changed its name: new '%s', old '%s'. Updating.", + fDescr.Url, fDescr.Name, otherId.Name) + changed = true + break + } + } + if changed { + delete(cache.Ids, otherId) + } else { + id = feedId(cache.NextId) + cache.NextId++ + } + + cache.Ids[fDescr] = id + } + + cf := cache.getItem(id) + cf.feed = f + f.SetExtID(id) + return cf +} + +func (cf *cachedFeed) cachedItem(item *feed.Item) cachedItem { + var ci cachedItem + + ci.ID = item.ID + ci.Title = item.Item.Title + ci.Link = item.Item.Link + if item.DateParsed() != nil { + ci.Date = *item.DateParsed() + } + ci.Guid = item.Item.GUID + + contentByte := []byte(item.Item.Description + item.Item.Content) + ci.Hash = sha256.Sum256(contentByte) + + return ci +} + +func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool { + return other.Title == item.Title && + other.Link == item.Link && + other.Date.Equal(item.Date) && + (ignoreHash || other.Hash == item.Hash) +} + +func (cf *cachedFeed) markItemDeleted(index int) { + cf.Items[index].deleted = true +} + +func (cf *cachedFeed) Filter(items []feed.Item, ignoreHash, alwaysNew bool) []feed.Item { + if len(items) == 0 { + return items + } + + cacheItems := make(map[cachedItem]*feed.Item, len(items)) + for idx := range items { + i := &items[idx] + ci := cf.cachedItem(i) + + // remove complete duplicates on the go + cacheItems[ci] = i + } + log.Debugf("%d items after deduplication", len(cacheItems)) + + filtered := make([]feed.Item, 0, len(items)) + cacheadd := make([]cachedItem, 0, len(items)) + app := func(item *feed.Item, ci cachedItem, oldIdx *int) { + if oldIdx != nil { + item.UpdateOnly = true + prevId := cf.Items[*oldIdx].ID + ci.ID = prevId + item.ID = prevId + log.Debugf("oldIdx: %d, prevId: %s, item.id: %s", *oldIdx, prevId, item.Id()) + cf.markItemDeleted(*oldIdx) + } + filtered = append(filtered, *item) + cacheadd = append(cacheadd, ci) + } + + seen := func(oldIdx int) { + ci := cf.Items[oldIdx] + cf.markItemDeleted(oldIdx) + cacheadd = append(cacheadd, ci) + } + +CACHE_ITEMS: + for ci, item := range cacheItems { + log.Debugf("Now checking %s", ci) + + if ci.Guid != "" { + for idx, oldItem := range cf.Items { + if oldItem.Guid == ci.Guid { + log.Debugf("Guid matches with: %s", oldItem) + if !oldItem.similarTo(&ci, ignoreHash) { + item.AddReason("guid (upd)") + app(item, ci, &idx) + } else { + log.Debugf("Similar, ignoring item %s", base64.RawURLEncoding.EncodeToString(oldItem.ID[:])) + seen(idx) + } + + continue CACHE_ITEMS + } + } + + log.Debug("Found no matching GUID, including.") + item.AddReason("guid") + app(item, ci, nil) + continue + } + + for idx, oldItem := range cf.Items { + if oldItem.similarTo(&ci, ignoreHash) { + log.Debugf("Similarity matches, ignoring: %s", oldItem) + seen(idx) + continue CACHE_ITEMS + } + + if oldItem.Link == ci.Link { + if alwaysNew { + log.Debugf("Link matches, but `always-new`.") + item.AddReason("always-new") + continue + } + log.Debugf("Link matches, updating: %s", oldItem) + item.AddReason("link (upd)") + app(item, ci, &idx) + + continue CACHE_ITEMS + } + } + + log.Debugf("No match found, inserting.") + item.AddReason("new") + app(item, ci, nil) + } + + log.Debugf("%d items after filtering", len(filtered)) + + cf.newItems = append(cacheadd, filterItems(cf.Items)...) + + return filtered +} + +func filterItems(items []cachedItem) []cachedItem { + var n int + + if len(items) < maxCacheSize { + n = len(items) + } else { + n = maxCacheSize + } + + copiedItems := make([]cachedItem, 0, n) + for _, item := range items { + if !item.deleted { + copiedItems = append(copiedItems, item) + if len(copiedItems) >= n { + break + } + } + } + + return copiedItems +} -- cgit v1.2.3