"""
Chunker Comparison Tool for Borg Backup

This script analyzes and compares the statistical properties of different chunking algorithms
used in Borg Backup (BuzHash and BuzHash64). It helps evaluate how data is split into chunks
by each algorithm, which is crucial for deduplication efficiency.

Usage:
    python scripts/chunker_comparison.py [options]

Options:
    -g, --graphical       Enable graphical output (requires matplotlib)
    -o, --output PATH     Output file prefix for saving plots (implies --graphical)
    -d, --directory PATH  Path to directory containing files to analyze (instead of random data)
    -s, --size SIZE       Size of random data in MB (default: 100MB, only used when not using --directory)

Examples:
    # Analyze with 100MB of random data
    python scripts/chunker_comparison.py

    # Analyze with 500MB of random data
    python scripts/chunker_comparison.py --size 500

    # Analyze files in a directory and show graphical output
    python scripts/chunker_comparison.py --directory /path/to/files --graphical

    # Analyze files and save plots to disk
    python scripts/chunker_comparison.py --directory /path/to/files --output results/chunker_analysis
"""

import os
import statistics
import argparse
from io import BytesIO
from collections import defaultdict

from borg.chunkers import Chunker, ChunkerBuzHash64

# matplotlib/numpy are optional: they are only needed for --graphical/--output.
try:
    import matplotlib.pyplot as plt
    import numpy as np

    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False


def _summarize(name, chunk_sizes, min_exp, max_exp):
    """Build the statistics dict shared by all analyses, or None if no chunks.

    Past the emptiness check, ``chunk_sizes`` is guaranteed non-empty, so the
    individual fields need no per-field fallback values.
    """
    if not chunk_sizes:
        print(f"No chunks were produced by {name}")
        return None
    return {
        "name": name,
        "count": len(chunk_sizes),
        "min": min(chunk_sizes),
        "max": max(chunk_sizes),
        "mean": statistics.mean(chunk_sizes),
        "median": statistics.median(chunk_sizes),
        "std_dev": statistics.stdev(chunk_sizes) if len(chunk_sizes) > 1 else 0,
        # Chunks pinned at the hard lower/upper size limits: the cut was forced
        # by the limit, not found by the rolling hash. A large share here hints
        # at poor hash quality for the given data.
        "min_count": sum(int(size == 2**min_exp) for size in chunk_sizes),
        "max_count": sum(int(size == 2**max_exp) for size in chunk_sizes),
        "sizes": chunk_sizes,
    }


def analyze_chunker(chunker_class, name, data, min_exp, max_exp, mask_bits, winsize, seed_or_key, do_encrypt=False):
    """Analyze a chunker's performance on the given in-memory data.

    ``seed_or_key`` is an int seed for BuzHash and a bytes key for BuzHash64;
    ``do_encrypt`` is only forwarded to BuzHash64-family chunkers.
    Returns the statistics dict from ``_summarize`` or None.
    """
    chunk_sizes = []
    # Only the BuzHash64 constructors accept do_encrypt.
    kwargs = dict(do_encrypt=do_encrypt) if name.startswith("BuzHash64") else {}
    chunker = chunker_class(seed_or_key, min_exp, max_exp, mask_bits, winsize, **kwargs)
    with BytesIO(data) as f:
        for chunk in chunker.chunkify(f):
            chunk_sizes.append(chunk.meta["size"])

    return _summarize(name, chunk_sizes, min_exp, max_exp)


def analyze_chunker_on_files(chunker_class, name, file_paths, min_exp, max_exp, mask_bits, winsize, seed=0):
    """Analyze a chunker's performance on multiple files, chunked individually.

    Unreadable files are reported and skipped; empty files are skipped silently.
    Returns the statistics dict from ``_summarize`` or None.

    NOTE(review): the int ``seed`` default is also passed to ChunkerBuzHash64,
    which in the random-data path of main() is constructed with a bytes key --
    confirm against the borg.chunkers API whether an int is acceptable there.
    """
    all_chunk_sizes = []
    total_files_processed = 0

    for file_path in file_paths:
        try:
            # Skip empty files
            if os.path.getsize(file_path) == 0:
                continue

            # Process this individual file
            file_chunk_sizes = []
            chunker = chunker_class(seed, min_exp, max_exp, mask_bits, winsize)
            with open(file_path, "rb") as f:
                for chunk in chunker.chunkify(f):
                    file_chunk_sizes.append(chunk.meta["size"])

            all_chunk_sizes.extend(file_chunk_sizes)

            total_files_processed += 1
            print(f"  Processed {file_path}: {len(file_chunk_sizes)} chunks")

        except (IOError, OSError) as e:
            print(f"  Error processing {file_path}: {e}")
            continue

    print(f"Total files processed with {name}: {total_files_processed}")

    return _summarize(name, all_chunk_sizes, min_exp, max_exp)


def print_stats(stats):
    """Print statistics for a chunker; tolerates a None stats dict."""
    if stats is None:
        return

    print(f"Chunker: {stats['name']}")
    print(f"  Number of chunks: {stats['count']}")
    print(f"  Min chunk size: {stats['min']} bytes")
    print(f"  Max chunk size: {stats['max']} bytes")
    print(f"  Mean chunk size: {stats['mean']:.2f} bytes")
    print(f"  Median chunk size: {stats['median']:.2f} bytes")
    print(f"  Standard deviation: {stats['std_dev']:.2f} bytes")
    print(f"  Number of chunks at min size: {stats['min_count']} ({stats['min_count']/stats['count']*100:.2f}%)")
    print(f"  Number of chunks at max size: {stats['max_count']} ({stats['max_count']/stats['count']*100:.2f}%)")
    print()


def calculate_bucket(size):
    """Return the smallest power of two >= size (the histogram bucket).

    Sizes <= 1 map to bucket 1, matching the doubling-loop behavior this
    replaces; the bit_length form is O(1) instead of O(log size).
    """
    if size <= 1:
        return 1
    return 1 << (size - 1).bit_length()


def plot_chunk_size_histogram(buzhash_stats, buzhash64_stats, output_file=None):
    """Plot histogram of chunk sizes for both chunkers."""
    if not MATPLOTLIB_AVAILABLE:
        print("Matplotlib is not available. Skipping histogram plot.")
        return

    plt.figure(figsize=(12, 6))

    # Create histograms with logarithmic bins
    min_size = min(min(buzhash_stats["sizes"]), min(buzhash64_stats["sizes"]))
    max_size = max(max(buzhash_stats["sizes"]), max(buzhash64_stats["sizes"]))

    # Power-of-two bin edges spanning the full observed size range.
    bins = [2**i for i in range(int(np.log2(min_size)), int(np.log2(max_size)) + 2)]

    plt.hist(buzhash_stats["sizes"], bins=bins, alpha=0.5, label=buzhash_stats["name"])
    plt.hist(buzhash64_stats["sizes"], bins=bins, alpha=0.5, label=buzhash64_stats["name"])

    plt.xscale("log", base=2)
    plt.xlabel("Chunk Size (bytes)")
    plt.ylabel("Frequency")
    plt.title("Chunk Size Distribution")
    plt.grid(True, which="both", ls="--", alpha=0.5)
    plt.legend()

    if output_file:
        plt.savefig(f"{output_file}_histogram.png")
    else:
        plt.show()
    plt.close()


def plot_metrics_comparison(buzhash_stats, buzhash64_stats, output_file=None):
    """Plot comparison of key metrics between the two chunkers."""
    if not MATPLOTLIB_AVAILABLE:
        print("Matplotlib is not available. Skipping metrics comparison plot.")
        return

    metrics = ["count", "mean", "median", "std_dev"]
    buzhash_values = [buzhash_stats[m] for m in metrics]
    buzhash64_values = [buzhash64_stats[m] for m in metrics]

    # Normalize each metric pair to its max so bars are comparable on one axis.
    normalized_values = []
    for i, metric in enumerate(metrics):
        max_val = max(buzhash_values[i], buzhash64_values[i])
        normalized_values.append((buzhash_values[i] / max_val, buzhash64_values[i] / max_val))

    plt.figure(figsize=(10, 6))

    x = np.arange(len(metrics))
    width = 0.35

    plt.bar(x - width / 2, [v[0] for v in normalized_values], width, label=buzhash_stats["name"])
    plt.bar(x + width / 2, [v[1] for v in normalized_values], width, label=buzhash64_stats["name"])

    # Add actual (un-normalized) values as text above each bar.
    for i, metric in enumerate(metrics):
        plt.text(
            i - width / 2,
            normalized_values[i][0] + 0.05,
            f"{buzhash_values[i]:.1f}",
            ha="center",
            va="bottom",
            fontsize=9,
        )
        plt.text(
            i + width / 2,
            normalized_values[i][1] + 0.05,
            f"{buzhash64_values[i]:.1f}",
            ha="center",
            va="bottom",
            fontsize=9,
        )

    plt.xlabel("Metric")
    plt.ylabel("Normalized Value")
    plt.title("Comparison of Key Metrics")
    plt.xticks(x, metrics)
    plt.legend()
    plt.grid(True, axis="y", linestyle="--", alpha=0.7)

    if output_file:
        plt.savefig(f"{output_file}_metrics.png")
    else:
        plt.show()
    plt.close()


def plot_bucket_distribution(buzhash_dist, buzhash64_dist, buzhash_stats, buzhash64_stats, output_file=None):
    """Plot the power-of-2 bucket distribution."""
    if not MATPLOTLIB_AVAILABLE:
        print("Matplotlib is not available. Skipping bucket distribution plot.")
        return

    all_buckets = sorted(set(list(buzhash_dist.keys()) + list(buzhash64_dist.keys())))

    bh_pcts = [
        buzhash_dist[bucket] / buzhash_stats["count"] * 100 if buzhash_stats["count"] > 0 else 0
        for bucket in all_buckets
    ]
    bh64_pcts = [
        buzhash64_dist[bucket] / buzhash64_stats["count"] * 100 if buzhash64_stats["count"] > 0 else 0
        for bucket in all_buckets
    ]

    plt.figure(figsize=(12, 6))

    x = np.arange(len(all_buckets))
    width = 0.35

    plt.bar(x - width / 2, bh_pcts, width, label=buzhash_stats["name"])
    plt.bar(x + width / 2, bh64_pcts, width, label=buzhash64_stats["name"])

    plt.xlabel("Chunk Size Bucket (bytes)")
    plt.ylabel("Percentage of Chunks")
    plt.title("Chunk Size Distribution by Power-of-2 Buckets")
    plt.xticks(x, [f"{b:,}" for b in all_buckets], rotation=45)
    plt.legend()
    plt.grid(True, axis="y", linestyle="--", alpha=0.7)

    if output_file:
        plt.savefig(f"{output_file}_buckets.png")
    else:
        plt.show()
    plt.close()


def read_files_from_directory(directory_path):
    """
    Recursively find files from a directory.

    Args:
        directory_path: Path to the directory to read files from

    Returns:
        list: List of file paths to be processed individually
    """
    print(f"Finding files in directory: {directory_path}")
    file_paths = []
    total_size = 0

    for root, _, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                # Skip symbolic links, device files, etc.
                if not os.path.isfile(file_path) or os.path.islink(file_path):
                    continue

                file_size = os.path.getsize(file_path)
                # Skip empty files
                if file_size == 0:
                    continue

                file_paths.append(file_path)
                total_size += file_size
                print(f"  Found {file_path} ({file_size/1024:.1f}KB)")

            except (IOError, OSError) as e:
                print(f"  Error accessing {file_path}: {e}")
                continue

    print(f"Total found: {len(file_paths)} files, {total_size/1024/1024:.1f}MB from directory {directory_path}")
    return file_paths


def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Analyze and compare Borg chunkers")
    parser.add_argument("-g", "--graphical", action="store_true", help="Enable graphical output (requires matplotlib)")
    parser.add_argument(
        "-o", "--output", type=str, default=None, help="Output file prefix for saving plots (implies --graphical)"
    )
    parser.add_argument(
        "-d",
        "--directory",
        type=str,
        default=None,
        help="Path to directory containing files to analyze (instead of random data)",
    )
    parser.add_argument(
        "-s",
        "--size",
        type=int,
        default=100,
        help="Size of random data in MB (default: 100MB, only used when not using --directory)",
    )
    args = parser.parse_args()

    # Check if graphical output is requested but matplotlib is not available
    if (args.graphical or args.output) and not MATPLOTLIB_AVAILABLE:
        print("Warning: Graphical output requested but matplotlib is not available.")
        print("Install matplotlib to enable graphical output.")
        args.graphical = False

    # Configuration parameters
    min_exp = 19  # Minimum chunk size = 2^min_exp
    max_exp = 23  # Maximum chunk size = 2^max_exp
    mask_bits = 21  # Target chunk size = 2^mask_bits
    winsize = 4095  # Rolling hash window size, must be odd!

    print("=" * 80)
    print("BORG CHUNKER STATISTICAL ANALYSIS")
    print("=" * 80)
    print("Parameters:")
    print(f"  minexp={min_exp} (min chunk size: {2**min_exp} bytes)")
    print(f"  maxexp={max_exp} (max chunk size: {2**max_exp} bytes)")
    print(f"  maskbits={mask_bits} (target avg chunk size: ~{2**mask_bits} bytes)")
    print(f"  winsize={winsize}")
    print("-" * 80)

    if args.directory:
        # Get list of files from the specified directory
        file_paths = read_files_from_directory(args.directory)
        if not file_paths:
            print("Error: No files could be found in the specified directory.")
            return

        # Analyze both chunkers on individual files
        print("Analyzing chunkers on individual files...")
        buzhash_stats = analyze_chunker_on_files(Chunker, "BuzHash", file_paths, min_exp, max_exp, mask_bits, winsize)
        buzhash64_stats = analyze_chunker_on_files(
            ChunkerBuzHash64, "BuzHash64", file_paths, min_exp, max_exp, mask_bits, winsize
        )
    else:
        # Generate random data
        data_size = args.size * 1024 * 1024  # Convert MB to bytes
        print(f"Generating {data_size/1024/1024:.1f}MB of random data...")
        data = os.urandom(data_size)

        # Analyze both chunkers on random data
        print("Analyzing chunkers...")
        seed = 0
        buzhash_stats = analyze_chunker(
            Chunker, "BuzHash", data, min_exp, max_exp, mask_bits, winsize, seed_or_key=seed
        )
        key = b"0123456789abcdef0123456789abcdef"
        encrypt = True
        name = "BuzHash64e" if encrypt else "BuzHash64"
        buzhash64_stats = analyze_chunker(
            ChunkerBuzHash64, name, data, min_exp, max_exp, mask_bits, winsize, seed_or_key=key, do_encrypt=encrypt
        )

    # Print statistics
    print("\nChunker Statistics:")
    print_stats(buzhash_stats)
    print_stats(buzhash64_stats)

    # Compare the chunkers
    if buzhash_stats and buzhash64_stats:
        print("Comparison:")
        print(f"  BuzHash64/BuzHash chunk count ratio: {buzhash64_stats['count']/buzhash_stats['count']:.2f}")
        print(f"  BuzHash64/BuzHash mean chunk size ratio: {buzhash64_stats['mean']/buzhash_stats['mean']:.2f}")
        print(f"  BuzHash64/BuzHash std dev ratio: {buzhash64_stats['std_dev']/buzhash_stats['std_dev']:.2f}")

        # Calculate chunk size distribution
        buzhash_dist = defaultdict(int)
        buzhash64_dist = defaultdict(int)

        # Group chunk sizes into power-of-2 buckets
        for size in buzhash_stats["sizes"]:
            bucket = calculate_bucket(size)
            buzhash_dist[bucket] += 1

        for size in buzhash64_stats["sizes"]:
            bucket = calculate_bucket(size)
            buzhash64_dist[bucket] += 1

        print("\nChunk Size Distribution (power-of-2 buckets):")
        # Use the actual chunker names: the second one is "BuzHash64e" only
        # when encryption was enabled (random-data mode).
        print(f"  Size Bucket | {buzhash_stats['name']} Count (%) | {buzhash64_stats['name']} Count (%)")
        print("  -----------|-------------------|-------------------")

        all_buckets = sorted(set(list(buzhash_dist.keys()) + list(buzhash64_dist.keys())))
        for bucket in all_buckets:
            bh_count = buzhash_dist[bucket]
            bh64_count = buzhash64_dist[bucket]
            bh_pct = bh_count / buzhash_stats["count"] * 100 if buzhash_stats["count"] > 0 else 0
            bh64_pct = bh64_count / buzhash64_stats["count"] * 100 if buzhash64_stats["count"] > 0 else 0
            print(f"  {bucket:10d} | {bh_count:5d} ({bh_pct:5.1f}%) | {bh64_count:5d} ({bh64_pct:5.1f}%)")

        # Generate graphical output if requested
        if args.graphical or args.output:
            print("\nGenerating graphical output...")
            plot_chunk_size_histogram(buzhash_stats, buzhash64_stats, args.output)
            plot_metrics_comparison(buzhash_stats, buzhash64_stats, args.output)
            plot_bucket_distribution(buzhash_dist, buzhash64_dist, buzhash_stats, buzhash64_stats, args.output)
            if args.output:
                print(f"Plots saved with prefix: {args.output}")


if __name__ == "__main__":
    main()