Add dbfilescanner tool

pskopnik · pskopnik · commit 53a961d15a98 · 2023-05-22T16:54:18.000+02:00
diff --git a/meda/file.go b/meda/file.go
@@ -2,6 +2,8 @@ package meda
 
 import (
 	"context"
+	"fmt"
+	"io"
 
 	"github.com/jmoiron/sqlx"
 	"github.com/pkg/errors"
@@ -419,3 +421,159 @@ func (d *DB) FilesToBeReadFetcher(config *FilesToBeReadFetcherConfig) FilesToBeR
 		config: config,
 	}
 }
+
+// filesIteratorFetchQuery selects id, rand, path and file_size of the files
+// whose id lies in the range (lower, upper], ordered by ascending id and
+// capped by a row limit. Its three placeholders are, in order: the exclusive
+// lower id bound, the inclusive upper id bound and the row limit.
+//
+// {FILES} is resolved through SubstituteAll before the query is executed
+// (presumably to the name of the files table -- confirm against GenericQuery).
+const filesIteratorFetchQuery = GenericQuery(`
+	SELECT
+		id, rand, path, file_size
+	FROM {FILES}
+		WHERE
+				id > ?
+			AND
+				id <= ?
+		ORDER BY id ASC
+		LIMIT ?
+	;
+`)
+
+// filesIteratorNextChunkQuery yields the id which ends the next chunk: the
+// inner SELECT takes the ids greater than the first placeholder in ascending
+// order, limited to the second placeholder, and the outer ORDER BY ... DESC
+// LIMIT 1 keeps only the largest of them.
+//
+// After substitution it is handed to the ChunkIterator as its NextChunkQuery.
+const filesIteratorNextChunkQuery = GenericQuery(`
+	(
+		SELECT
+			id
+		FROM {FILES}
+			WHERE id > ?
+			ORDER BY id ASC
+			LIMIT ?
+	)
+		ORDER BY id DESC LIMIT 1;
+`)
+
+// FilesIteratorConfig holds the tuning parameters of a FilesIterator.
+//
+// ChunkSize is passed on to the ChunkIterator as its ChunkSize. BatchSize is
+// the row limit of every fetch query, i.e. the maximum number of files held
+// in memory at once.
+type FilesIteratorConfig struct {
+	ChunkSize uint64
+	BatchSize uint64
+}
+
+// FilesIterator iterates over the rows of the files table in ascending id
+// order, fetching them from the database in batches. Advance it with Next,
+// read the current row with Element and consult Error once Next has returned
+// false.
+type FilesIterator struct {
+	ctx    context.Context
+	db     *DB
+	config FilesIteratorConfig
+	// chunkIt is the ChunkIterator used to limit all queries to only a range
+	// of rows; its LastID() is the inclusive upper id bound of a fetch query.
+	// chunkContainsRows is set when a new chunk has been entered and cleared
+	// when a fetch returns fewer rows than requested. lastID is the id of the
+	// last file fetched and the exclusive lower id bound of the next query.
+	chunkIt           ChunkIterator
+	chunkContainsRows bool
+	lastID            uint64
+
+	// completed is set once fetchNextBatch has reported io.EOF.
+	completed bool
+
+	// err is the error which stopped iteration, as returned by Error. batch
+	// holds the most recently fetched files and i is the index of the current
+	// element within batch.
+	err   error
+	batch []File
+	i     int
+}
+
+// Next advances the iterator to the next file, fetching a fresh batch from
+// the database whenever the current one is used up. It returns true if a file
+// is available through Element. It returns false once every file has been
+// visited or after an error; Error tells the two apart.
+func (f *FilesIterator) Next() bool {
+	if f.err != nil {
+		return false
+	}
+
+	// The current batch still holds an unvisited element.
+	if next := f.i + 1; next < len(f.batch) {
+		f.i = next
+		return true
+	}
+
+	if f.completed {
+		return false
+	}
+
+	switch err := f.fetchNextBatch(); {
+	case err == io.EOF:
+		// Regular end of the table, not an error condition.
+		f.completed = true
+		return false
+	case err != nil:
+		f.err = err
+		return false
+	}
+
+	f.i = 0
+	return true
+}
+
+// Element returns a pointer to the file the iterator currently points at.
+// It must only be called after Next has returned true; otherwise the index
+// into the batch is out of range and the call panics.
+//
+// The pointer refers into the iterator's batch slice, which is handed back
+// for refilling on the next fetch -- presumably the pointee is overwritten
+// then, so copy the File if it has to outlive the next call to Next.
+func (f *FilesIterator) Element() *File {
+	current := &f.batch[f.i]
+	return current
+}
+
+// Error returns the error which made Next return false, or nil if iteration
+// has not failed (including when it ended because all files were visited).
+func (f *FilesIterator) Error() error {
+	return f.err
+}
+
+func (f *FilesIterator) initialise() {
+	f.chunkIt = ChunkIterator{
+		ChunkSize:      f.config.ChunkSize,
+		NextChunkQuery: filesIteratorNextChunkQuery.SubstituteAll(f.db),
+	}
+}
+
+// fetchNextBatch replaces f.batch with the next non-empty batch of files.
+//
+// Every query is restricted to ids in (f.lastID, f.chunkIt.LastID()] and to
+// at most BatchSize rows. When a query returns fewer rows than requested, the
+// current chunk is treated as exhausted and the chunk iterator is advanced
+// before the next query is issued.
+//
+// It returns io.EOF (unwrapped, so that callers can compare with ==) once the
+// chunk iterator has no further chunks, and a wrapped error if a query, the
+// row scan or the chunk iterator fails. A BatchSize of zero is rejected with
+// an error.
+func (f *FilesIterator) fetchNextBatch() error {
+	queryLimit := f.config.BatchSize
+	if queryLimit == 0 {
+		// LIMIT 0 never returns rows, so the chunk would never be detected as
+		// exhausted and the loop below would query the database forever.
+		return fmt.Errorf("(*FilesIterator).fetchNextBatch: BatchSize must be greater than zero")
+	}
+
+	// Lazily set up the chunk iterator on first use.
+	if f.chunkIt.NextChunkQuery == "" {
+		f.initialise()
+	}
+
+	for {
+		if !f.chunkContainsRows {
+			ok, err := f.advanceToNextChunk()
+			if err != nil {
+				return fmt.Errorf("(*FilesIterator).fetchNextBatch: %w", err)
+			} else if !ok {
+				return io.EOF
+			}
+		}
+
+		rows, err := f.db.QueryxContext(
+			f.ctx,
+			filesIteratorFetchQuery.SubstituteAll(f.db),
+			f.lastID,
+			f.chunkIt.LastID(),
+			queryLimit,
+		)
+		if err != nil {
+			return fmt.Errorf("(*FilesIterator).fetchNextBatch: querying db: %w", err)
+		}
+
+		// Hand the previous batch back (truncated to length zero) to be refilled.
+		f.batch, err = filesAppendFromRowsAndClose(f.batch[:0], rows)
+		if err != nil {
+			return fmt.Errorf("(*FilesIterator).fetchNextBatch: %w", err)
+		}
+
+		if uint64(len(f.batch)) < queryLimit {
+			// less rows returned than requested as chunk is exhausted
+			f.chunkContainsRows = false
+		}
+
+		if len(f.batch) > 0 {
+			// at least one file was fetched
+
+			f.lastID = f.batch[len(f.batch)-1].ID
+
+			return nil
+		}
+
+		// The chunk held no (further) rows; loop to try the next chunk.
+	}
+}
+
+// advanceToNextChunk moves the chunk iterator on to its next chunk. It
+// returns true and marks the chunk as containing rows if there is one. It
+// returns false with a nil error when the chunk iterator is exhausted, and
+// false with a wrapped error when the chunk iterator failed.
+func (f *FilesIterator) advanceToNextChunk() (bool, error) {
+	if f.chunkIt.Next(f.ctx, f.db) {
+		f.chunkContainsRows = true
+		return true, nil
+	}
+
+	if err := f.chunkIt.Err(); err != nil {
+		return false, fmt.Errorf("(*FilesIterator).advanceToNextChunk: %w", err)
+	}
+
+	// no more chunks, chunkIterator exhausted
+	return false, nil
+}
+
+func (d *DB) FilesIterator(ctx context.Context, config FilesIteratorConfig) FilesIterator {
+	return FilesIterator{
+		ctx:    ctx,
+		db:     d,
+		config: config,
+	}
+}
diff --git a/tools/dbfilescanner/README.md b/tools/dbfilescanner/README.md
@@ -0,0 +1,13 @@
+# dbfilescanner
+
+This tool reads the files table in the database once.
+It can be used to efficiently read the entire table into the InnoDB buffer pool (in-memory cache).
+It also serves as a basis to run custom sequential analysis on the data.
+
+As its first and only argument, it expects a configuration file in the same format as the one used by lsdf-checksum-master.
+Only the `db` stanza from the config is used.
+
+```shell
+$ go build .
+$ ./dbfilescanner config.yaml
+```
diff --git a/tools/dbfilescanner/main.go b/tools/dbfilescanner/main.go
@@ -0,0 +1,83 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+
+	"gopkg.in/yaml.v3"
+
+	"git.scc.kit.edu/sdm/lsdf-checksum/meda"
+)
+
+// Config is the part of the configuration file this tool reads. Only the
+// database settings are decoded; they are later merged over
+// meda.DefaultConfig before the database is opened.
+type Config struct {
+	DB meda.Config
+}
+
+// readConfig opens the YAML file at path and decodes it into a Config.
+//
+// Both a failure to open the file and a failure to decode it are returned to
+// the caller as wrapped errors; the function never panics on I/O problems.
+func readConfig(path string) (*Config, error) {
+	configFile, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("opening config file: %w", err)
+	}
+	// The file is only read, so a Close error carries no information.
+	defer configFile.Close()
+
+	config := &Config{}
+
+	dec := yaml.NewDecoder(configFile)
+	if err := dec.Decode(config); err != nil {
+		return nil, fmt.Errorf("decoding config file %q: %w", path, err)
+	}
+
+	return config, nil
+}
+
+// scanFiles walks over every row of the files table once, counting the files
+// and summing up their sizes. Progress is logged every one million files and
+// the totals are logged at the end. It returns the iterator's error if the
+// scan stopped early.
+func scanFiles(db *meda.DB) error {
+	const logInterval = 1000000
+
+	it := db.FilesIterator(context.Background(), meda.FilesIteratorConfig{
+		ChunkSize: 100000,
+		BatchSize: 10000,
+	})
+
+	var (
+		fileCount        uint64
+		lastLogFileCount uint64
+		fileTotalSize    uint64
+	)
+
+	for it.Next() {
+		fileTotalSize += it.Element().FileSize
+		fileCount++
+
+		if fileCount >= lastLogFileCount+logInterval {
+			log.Printf("Read %d files so far...", fileCount)
+			lastLogFileCount = fileCount
+		}
+	}
+	if err := it.Error(); err != nil {
+		return err
+	}
+
+	log.Printf("Completed reading")
+	log.Printf("Read %d files with total size %d", fileCount, fileTotalSize)
+
+	return nil
+}
+
+// main expects the path of the configuration file as its only argument,
+// loads it, opens the database and scans the files table once.
+//
+// Usage errors are printed to stderr. Every other failure is reported through
+// the log package; in all cases the process exits with status 1 rather than
+// panicking with a stack trace.
+func main() {
+	if len(os.Args) != 2 {
+		fmt.Fprintln(os.Stderr, "Usage:", os.Args[0], "<config.yaml>")
+		os.Exit(1)
+	}
+
+	config, err := readConfig(os.Args[1])
+	if err != nil {
+		log.Fatalf("reading config: %v", err)
+	}
+
+	db, err := meda.Open(meda.DefaultConfig.Clone().Merge(&config.DB))
+	if err != nil {
+		log.Fatalf("opening database: %v", err)
+	}
+	// NOTE(review): db is never closed explicitly; confirm whether meda.DB
+	// needs a Close call before the process exits.
+
+	if err := scanFiles(db); err != nil {
+		log.Fatalf("scanning files: %v", err)
+	}
+}