From 0dbfe37fbddb95c184d845c79bbe014597d55fe8 Mon Sep 17 00:00:00 2001 From: Alan Pearce Date: Thu, 23 May 2024 13:14:45 +0200 Subject: feat: stream files directly from fetcher to importer Use IndexMeta to store the information relevant to making conditional updates in future runs. --- internal/fetcher/http.go | 74 +++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 26 deletions(-) (limited to 'internal/fetcher/http.go') diff --git a/internal/fetcher/http.go b/internal/fetcher/http.go index 9afbbc0..675c3b3 100644 --- a/internal/fetcher/http.go +++ b/internal/fetcher/http.go @@ -3,68 +3,90 @@ package fetcher import ( "context" "fmt" + "io" "log/slog" "net/http" - "os" "searchix/internal/config" - "searchix/internal/file" "strings" "time" + "github.com/andybalholm/brotli" "github.com/pkg/errors" ) -func fetchFileIfNeeded(ctx context.Context, path string, url string) (needed bool, err error) { - stat, err := file.StatIfExists(path) - if err != nil { - return false, errors.WithMessagef(err, "could not stat file %s", path) +type brotliReadCloser struct { + src io.ReadCloser + *brotli.Reader +} + +func newBrotliReader(src io.ReadCloser) *brotliReadCloser { + return &brotliReadCloser{ + src: src, + Reader: brotli.NewReader(src), } +} - var mtime string - if stat != nil { - mtime = strings.Replace(stat.ModTime().UTC().Format(time.RFC1123), "UTC", "GMT", 1) +func (r *brotliReadCloser) Close() error { + return errors.Wrap(r.src.Close(), "failed to call close on underlying reader") +} + +func fetchFileIfNeeded( + ctx context.Context, + mtime time.Time, + url string, +) (body io.ReadCloser, newMtime time.Time, err error) { + var ifModifiedSince string + if !mtime.IsZero() { + ifModifiedSince = strings.Replace(mtime.UTC().Format(time.RFC1123), "UTC", "GMT", 1) } req, err := http.NewRequestWithContext(ctx, "GET", url, http.NoBody) if err != nil { - return false, errors.WithMessagef(err, "could not create HTTP request for %s", url) + err = errors.WithMessagef(err, "could not create HTTP request for %s", url) + + return } req.Header.Set("User-Agent", fmt.Sprintf("Searchix %s", config.ShortSHA)) - if mtime != "" { - req.Header.Set("If-Modified-Since", mtime) + if ifModifiedSince != "" { + req.Header.Set("If-Modified-Since", ifModifiedSince) } res, err := http.DefaultClient.Do(req) if err != nil { - return false, errors.WithMessagef(err, "could not make HTTP request to %s", url) + err = errors.WithMessagef(err, "could not make HTTP request to %s", url) + + return } - defer res.Body.Close() switch res.StatusCode { case http.StatusNotModified: - needed = false + newMtime = mtime + + return case http.StatusOK: - newMtime, err := time.Parse(time.RFC1123, res.Header.Get("Last-Modified")) + newMtime, err = time.Parse(time.RFC1123, res.Header.Get("Last-Modified")) if err != nil { slog.Warn( "could not parse Last-Modified header from response", "value", res.Header.Get("Last-Modified"), ) + newMtime = time.Now() } - err = file.WriteToFile(path, res.Body) - if err != nil { - return false, errors.WithMessagef(err, "could not write response body to file %s", path) - } - err = os.Chtimes(path, time.Time{}, newMtime) - if err != nil { - slog.Warn("could not update mtime on file", "file", path) + + switch ce := res.Header.Get("Content-Encoding"); ce { + case "br": + slog.Debug("using brotli encoding") + body = newBrotliReader(res.Body) + case "", "identity", "gzip": + body = res.Body + default: + err = fmt.Errorf("cannot handle a body with content-encoding %s", ce) } - needed = true default: - return false, fmt.Errorf("got response code %d, don't know what to do", res.StatusCode) + err = fmt.Errorf("got response code %d, don't know what to do", res.StatusCode) } - return needed, nil + return } -- cgit 1.4.1