aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--internal/feed/cache.go2
-rw-r--r--internal/feed/cache_v1.go155
-rw-r--r--internal/feed/feed.go14
-rw-r--r--internal/feed/mail.go2
-rw-r--r--internal/feed/parse.go2
-rw-r--r--internal/feed/state.go39
-rw-r--r--main.go5
-rw-r--r--pkg/log/log.go4
-rw-r--r--pkg/util/util.go10
9 files changed, 219 insertions, 14 deletions
diff --git a/internal/feed/cache.go b/internal/feed/cache.go
index 735a7d6..2a51cb5 100644
--- a/internal/feed/cache.go
+++ b/internal/feed/cache.go
@@ -27,6 +27,8 @@ type CachedFeed interface {
Checked(withFailure bool)
Failures() uint
Last() time.Time
+ filterItems([]feeditem) []feeditem
+ Commit()
}
func cacheForVersion(version Version) (Cache, error) {
diff --git a/internal/feed/cache_v1.go b/internal/feed/cache_v1.go
index 1c53239..a8e40ed 100644
--- a/internal/feed/cache_v1.go
+++ b/internal/feed/cache_v1.go
@@ -2,9 +2,11 @@ package feed
import (
"crypto/sha256"
+ "fmt"
"time"
"github.com/Necoro/feed2imap-go/pkg/log"
+ "github.com/Necoro/feed2imap-go/pkg/util"
)
const (
@@ -21,25 +23,37 @@ type v1Cache struct {
}
type cachedFeed struct {
- LastCheck time.Time
- NumFailures uint // can't be named `Failures` b/c it'll collide with the interface
- Items []cachedItem
+ LastCheck time.Time
+ currentCheck time.Time
+ NumFailures uint // can't be named `Failures` b/c it'll collide with the interface
+ Items []cachedItem
+ newItems []cachedItem
}
type itemHash [sha256.Size]byte
type cachedItem struct {
- Uid string
- Title string
- Link string
- Date time.Time
- Updated time.Time
- Creator string
- Hash itemHash
+ Guid string
+ Title string
+ Link string
+ PublishedDate time.Time
+ UpdatedDate time.Time
+ UpdatedCache time.Time
+ Hash itemHash
+}
+
+func (item cachedItem) String() string {
+ return fmt.Sprintf(`{
+ Title: %q
+ Guid: %q
+ Link: %q
+ Published: %s
+ Updated: %s
+}`, item.Title, item.Guid, item.Link, util.TimeFormat(item.PublishedDate), util.TimeFormat(item.UpdatedDate))
}
func (cf *cachedFeed) Checked(withFailure bool) {
- cf.LastCheck = time.Now()
+ cf.currentCheck = time.Now()
if withFailure {
cf.NumFailures++
} else {
@@ -47,6 +61,12 @@ func (cf *cachedFeed) Checked(withFailure bool) {
}
}
+func (cf *cachedFeed) Commit() {
+ cf.Items = cf.newItems
+ cf.newItems = nil
+ cf.LastCheck = cf.currentCheck
+}
+
func (cf *cachedFeed) Failures() uint {
return cf.NumFailures
}
@@ -118,3 +138,116 @@ func (cache *v1Cache) findItem(feed *Feed) CachedFeed {
feed.cached = item
return item
}
+
+func newCachedItem(item feeditem) cachedItem {
+ var ci cachedItem
+
+ ci.Title = item.Item.Title
+ ci.Link = item.Item.Link
+ if item.Item.PublishedParsed != nil {
+ ci.PublishedDate = *item.Item.PublishedParsed
+ }
+ if item.Item.UpdatedParsed != nil && !item.Item.UpdatedParsed.Equal(ci.PublishedDate) {
+ ci.UpdatedDate = *item.Item.UpdatedParsed
+ }
+ ci.Guid = item.Item.GUID
+
+ contentByte := []byte(item.Item.Description + item.Item.Content)
+ ci.Hash = sha256.Sum256(contentByte)
+
+ return ci
+}
+
+func (item *cachedItem) similarTo(other *cachedItem, ignoreHash bool) bool {
+ return other.Title == item.Title ||
+ other.Link == item.Link ||
+ other.PublishedDate.Equal(item.PublishedDate) ||
+ (!ignoreHash && other.Hash == item.Hash)
+}
+
+func (cf *cachedFeed) deleteItem(index int) {
+ copy(cf.Items[index:], cf.Items[index+1:])
+ cf.Items[len(cf.Items)-1] = cachedItem{}
+ cf.Items = cf.Items[:len(cf.Items)-1]
+}
+
+func (cf *cachedFeed) filterItems(items []feeditem) []feeditem {
+ if len(items) == 0 {
+ return items
+ }
+
+ cacheItems := make(map[cachedItem]*feeditem, len(items))
+ for idx := range items {
+ // remove complete duplicates on the go
+ cacheItems[newCachedItem(items[idx])] = &items[idx]
+ }
+ log.Debugf("%d items after deduplication", len(cacheItems))
+
+ filtered := make([]feeditem, 0, len(items))
+ cacheadd := make([]cachedItem, 0, len(items))
+ app := func(item *feeditem, ci cachedItem, oldIdx *int) {
+ if oldIdx != nil {
+ item.updateOnly = true
+ cf.deleteItem(*oldIdx)
+ }
+ filtered = append(filtered, *item)
+ cacheadd = append(cacheadd, ci)
+ }
+
+CACHE_ITEMS:
+ for ci, item := range cacheItems {
+ log.Debugf("Now checking %s", ci)
+ if cf.LastCheck.IsZero() || ci.PublishedDate.After(cf.LastCheck) {
+ log.Debug("Newer than last check, including.")
+
+ item.addReason("newer")
+ app(item, ci, nil)
+ continue
+ }
+
+ if ci.Guid != "" {
+ for idx, oldItem := range cf.Items {
+ if oldItem.Guid == ci.Guid {
+ log.Debugf("Guid matches with: %s", oldItem)
+ if !oldItem.similarTo(&ci, false) {
+ item.addReason("guid (upd)")
+ app(item, ci, &idx)
+ } else {
+ log.Debugf("Similar, ignoring")
+ }
+
+ continue CACHE_ITEMS
+ }
+ }
+
+ log.Debug("Found no matching GUID, including.")
+ item.addReason("guid")
+ app(item, ci, nil)
+ continue
+ }
+
+ for idx, oldItem := range cf.Items {
+ if oldItem.similarTo(&ci, false) {
+ log.Debugf("Similarity matches, ignoring: %s", oldItem)
+ continue CACHE_ITEMS
+ }
+
+ if oldItem.Link == ci.Link {
+ log.Debugf("Link matches, updating: %s", oldItem)
+ item.addReason("link (upd)")
+ app(item, ci, &idx)
+
+ continue CACHE_ITEMS
+ }
+ }
+
+ log.Debugf("No match found, inserting.")
+ app(item, ci, nil)
+ }
+
+ log.Debugf("%d items after filtering", len(filtered))
+
+ cf.newItems = append(cacheadd, cf.Items...)
+
+ return filtered
+}
diff --git a/internal/feed/feed.go b/internal/feed/feed.go
index 6b95f6e..433d080 100644
--- a/internal/feed/feed.go
+++ b/internal/feed/feed.go
@@ -24,6 +24,8 @@ type feedDescriptor struct {
type feeditem struct {
*gofeed.Feed
*gofeed.Item
+ updateOnly bool
+ reasons []string
}
func (item feeditem) Creator() string {
@@ -33,6 +35,10 @@ func (item feeditem) Creator() string {
return ""
}
+func (item feeditem) addReason(reason string) {
+ item.reasons = append(item.reasons, reason)
+}
+
func (feed *Feed) descriptor() feedDescriptor {
return feedDescriptor{
Name: feed.Name,
@@ -51,6 +57,12 @@ func (feed *Feed) NeedsUpdate(updateTime time.Time) bool {
return true
}
-func (feed *Feed) Success() bool {
+func (feed *Feed) FetchSuccessful() bool {
return feed.feed != nil
}
+
+func (feed *Feed) MarkSuccess() {
+ if feed.cached != nil {
+ feed.cached.Commit()
+ }
+}
diff --git a/internal/feed/mail.go b/internal/feed/mail.go
index d07f8cf..620b444 100644
--- a/internal/feed/mail.go
+++ b/internal/feed/mail.go
@@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"io"
+ "strings"
"time"
"github.com/emersion/go-message/mail"
@@ -40,6 +41,7 @@ func writeToBuffer(b *bytes.Buffer, feed *Feed, item feeditem, cfg *config.Confi
h.SetAddressList("From", fromAdress(feed, item, cfg))
h.SetAddressList("To", address(feed.Name, cfg.DefaultEmail))
h.Add("X-Feed2Imap-Version", config.Version())
+ h.Add("X-Feed2Imap-Reason", strings.Join(item.reasons, ","))
{ // date
date := item.Item.PublishedParsed
diff --git a/internal/feed/parse.go b/internal/feed/parse.go
index afca971..d7b20ad 100644
--- a/internal/feed/parse.go
+++ b/internal/feed/parse.go
@@ -27,7 +27,7 @@ func parseFeed(feed *Feed) error {
feed.feed = parsedFeed
feed.items = make([]feeditem, len(parsedFeed.Items))
for idx, item := range parsedFeed.Items {
- feed.items[idx] = feeditem{parsedFeed, item}
+ feed.items[idx] = feeditem{Feed: parsedFeed, Item: item}
}
return nil
}
diff --git a/internal/feed/state.go b/internal/feed/state.go
index 2a0a1e1..8ec8dfd 100644
--- a/internal/feed/state.go
+++ b/internal/feed/state.go
@@ -4,6 +4,7 @@ import (
"sync"
"github.com/Necoro/feed2imap-go/pkg/config"
+ "github.com/Necoro/feed2imap-go/pkg/log"
)
type State struct {
@@ -50,7 +51,7 @@ func (state *State) Fetch() int {
ctr := 0
for _, feed := range state.feeds {
- success := feed.Success()
+ success := feed.FetchSuccessful()
feed.cached.Checked(!success)
if success {
@@ -61,6 +62,42 @@ func (state *State) Fetch() int {
return ctr
}
+func filterFeed(feed *Feed, group *sync.WaitGroup) {
+ if len(feed.items) > 0 {
+ origLen := len(feed.items)
+
+ log.Debugf("Filtering %s. Starting with %d items", feed.Name, origLen)
+ items := feed.cached.filterItems(feed.items)
+ feed.items = items
+
+ newLen := len(feed.items)
+ if newLen < origLen {
+ log.Printf("Filtered %s. Reduced from %d to %d items.", feed.Name, origLen, newLen)
+ } else {
+ log.Printf("Filtered %s, no reduction.", feed.Name)
+ }
+
+ } else {
+ log.Debugf("No items for %s. No filtering.", feed.Name)
+ }
+
+ if group != nil {
+ // group is nil in debug case
+ group.Done()
+ }
+}
+
+func (state *State) Filter() {
+ if log.IsDebug() {
+ // single threaded for better output
+ state.Foreach(func(f *Feed) {
+ filterFeed(f, nil)
+ })
+ } else {
+ state.ForeachGo(filterFeed)
+ }
+}
+
func NewState(cfg *config.Config) *State {
state := State{
feeds: map[string]*Feed{},
diff --git a/main.go b/main.go
index 026ef0a..5b76871 100644
--- a/main.go
+++ b/main.go
@@ -28,6 +28,7 @@ func processFeed(feed *feed.Feed, cfg *config.Config, client *imap.Client, wg *s
}
if len(mails) == 0 {
+ feed.MarkSuccess()
return
}
@@ -43,6 +44,8 @@ func processFeed(feed *feed.Feed, cfg *config.Config, client *imap.Client, wg *s
}
log.Printf("Uploaded %d messages to '%s' @ %s", len(mails), feed.Name, folder)
+
+ feed.MarkSuccess()
}
func run() error {
@@ -82,6 +85,8 @@ func run() error {
return fmt.Errorf("No successfull feed fetch.")
}
+ state.Filter()
+
imapUrl, err := url.Parse(cfg.Target)
if err != nil {
return fmt.Errorf("parsing 'target': %w", err)
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 4c69c1c..1e419e3 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -29,6 +29,10 @@ func SetDebug() {
level = debug
}
+func IsDebug() bool {
+ return level == debug
+}
+
func Debug(v ...interface{}) {
if level <= debug {
_ = debugLogger.Output(2, fmt.Sprint(v...))
diff --git a/pkg/util/util.go b/pkg/util/util.go
index c5472ab..659c886 100644
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -1,5 +1,7 @@
package util
+import "time"
+
func StrContains(haystack []string, needle string) bool {
for _, s := range haystack {
if s == needle {
@@ -9,3 +11,11 @@ func StrContains(haystack []string, needle string) bool {
return false
}
+
+func TimeFormat(t time.Time) string {
+ if t.IsZero() {
+ return "not set"
+ } else {
+ return t.Format(time.ANSIC)
+ }
+}