mattermost/server/platform/services/docextractor/archive.go
Eva Sarafianou c90dfa4895
Move to library archives (#31100)
* Move to library archives

* wrap the error with %w

Co-authored-by: Jesse Hallam <jesse.hallam@gmail.com>

* remove redundant format

---------

Co-authored-by: Jesse Hallam <jesse.hallam@gmail.com>
2025-06-02 09:21:37 +03:00

101 lines
2.2 KiB
Go

// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package docextractor
import (
"bytes"
"context"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/mholt/archives"
)
// archiveExtractor produces text from archive files: the path of every
// non-directory entry and, when SubExtractor is set, whatever text
// SubExtractor extracts from each entry's contents.
type archiveExtractor struct {
	// SubExtractor, when non-nil, is run on the contents of every
	// non-directory entry found inside the archive.
	SubExtractor Extractor
}
// Name returns the fixed identifier of this extractor.
func (ae *archiveExtractor) Name() string {
	const extractorName = "archiveExtractor"
	return extractorName
}
// Match reports whether archives.Identify recognizes filename as a supported
// format. No stream is supplied (nil), so the decision is made without
// reading any file contents.
func (ae *archiveExtractor) Match(filename string) bool {
	if _, _, err := archives.Identify(context.Background(), filename, nil); err != nil {
		return false
	}
	return true
}
// getExtAlsoTarGz returns the file extension of name as reported by
// filepath.Ext, except that a name ending in ".tar.gz" yields the full
// ".tar.gz" double extension instead of just ".gz". The suffix check is
// case-sensitive.
func getExtAlsoTarGz(name string) string {
	const tarGzExt = ".tar.gz"

	if !strings.HasSuffix(name, tarGzExt) {
		return filepath.Ext(name)
	}
	return tarGzExt
}
// Extract returns the text gathered from the archive read from r.
//
// The contents of r are first copied into a temporary file whose name keeps
// the extension of name (".tar.gz" is kept as a whole); the temporary file is
// removed when Extract returns. The archive is then walked as a file system:
// every non-directory entry contributes its path followed by a space. When
// ae.SubExtractor is set, each such entry is additionally read fully into
// memory and passed to the sub-extractor, and any text it returns is appended
// followed by a space. Errors returned by the sub-extractor are ignored so
// that one entry it cannot handle does not abort the whole archive.
//
// An error is returned when the temporary file cannot be created, written or
// closed, when the archive cannot be opened as a file system, or when walking
// the archive or opening/reading one of its entries fails. The returned
// string is empty whenever the error is non-nil.
func (ae *archiveExtractor) Extract(name string, r io.ReadSeeker) (string, error) {
	ext := getExtAlsoTarGz(name)

	// Create a temporary file, using `*` to control the random component while
	// preserving the extension.
	f, err := os.CreateTemp("", "archiver-*"+ext)
	if err != nil {
		return "", fmt.Errorf("error creating temporary file: %w", err)
	}
	defer os.Remove(f.Name())

	// Always close the handle, whether or not the copy succeeded, and report
	// the copy error first since it is the root cause when both fail.
	_, copyErr := io.Copy(f, r)
	closeErr := f.Close()
	if copyErr != nil {
		return "", fmt.Errorf("error copying data into temporary file: %w", copyErr)
	}
	if closeErr != nil {
		// A failed close can mean the data never reached the file, in which
		// case the archive on disk would be truncated.
		return "", fmt.Errorf("error closing temporary file: %w", closeErr)
	}

	fsys, err := archives.FileSystem(context.Background(), f.Name(), nil)
	if err != nil {
		return "", fmt.Errorf("error creating file system: %w", err)
	}

	// Built once and reused for every entry: turns '-', '.' and ',' in an
	// entry's base name into spaces in a single pass.
	separators := strings.NewReplacer("-", " ", ".", " ", ",", " ")

	var text strings.Builder
	err = fs.WalkDir(fsys, ".", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() {
			return nil
		}

		text.WriteString(path + " ")
		if ae.SubExtractor == nil {
			return nil
		}

		// NOTE(review): the name handed to the sub-extractor has its dots
		// replaced by spaces, so it no longer carries a recognizable
		// extension — confirm this is what sub-extractors expect.
		filename := separators.Replace(filepath.Base(path))

		file, err := fsys.Open(path)
		if err != nil {
			return err
		}
		// Runs when this callback returns, i.e. once per entry.
		defer file.Close()

		data, err := io.ReadAll(file)
		if err != nil {
			return err
		}

		// Best effort: a sub-extractor failure only skips this entry's text.
		subtext, extractErr := ae.SubExtractor.Extract(filename, bytes.NewReader(data))
		if extractErr == nil {
			text.WriteString(subtext + " ")
		}
		return nil
	})
	if err != nil {
		return "", err
	}

	return text.String(), nil
}