git @ Cat's Eye Technologies The-Dipple / master python / collect_sources
master

Tree @master (Download .tar.gz)

collect_sources @master raw · history · blame

#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the creator of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense

import argparse
import os
import sys
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Set


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Parse and return command line arguments.

    argv is the sequence of argument strings to parse, without the program
    name.  When it is None (the default) argparse falls back to the
    process's own command line, so existing zero-argument callers behave
    exactly as before.

    The returned namespace carries two attributes:

    directories -- list of Path objects, one per positional argument
                   (at least one is required)
    exclude_dir -- list of strings gathered from every --exclude-dir
                   option, empty when the option was never given
    """
    parser = argparse.ArgumentParser(
        description='Traverse directory trees and output contents of text files found therein.'
    )
    parser.add_argument(
        'directories',
        nargs='+',
        type=Path,
        help='Directories to process'
    )
    parser.add_argument(
        '--exclude-dir',
        action='append',
        default=[],
        help='Directory names to exclude (can be specified multiple times)'
    )
    # Passing None makes argparse read sys.argv[1:] itself.
    return parser.parse_args(argv)


def is_text_file(file_path: Path) -> bool:
    """
    Guess whether a file holds text by decoding its beginning as UTF-8.

    Up to the first 1024 characters are read.  Returns True when that
    read succeeds, and False when it raises anything at all: a decoding
    failure, a file that cannot be opened, or any other exception.
    """
    sample_size = 1024
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            handle.read(sample_size)
    except Exception:
        # Decoding errors and I/O errors alike mean "do not treat as text".
        return False
    else:
        return True


def should_skip_path(path: Path, excluded_dirs: Set[str]) -> bool:
    """
    Decide whether traversal should ignore the given path.

    Returns True when the final path component starts with '.', when
    that component is a member of excluded_dirs, or when the path is a
    symbolic link; returns False otherwise.
    """
    entry_name = path.name
    # The two name tests run first; the symlink test is evaluated last,
    # and only when neither name test matched.
    return (
        entry_name.startswith('.')
        or entry_name in excluded_dirs
        or path.is_symlink()
    )


def _report_access_error(error: Exception) -> None:
    """Write a one-line description of a traversal error to stderr."""
    if isinstance(error, PermissionError):
        print(f"Permission denied accessing {error.filename}", file=sys.stderr)
    else:
        print(f"Error accessing path: {error}", file=sys.stderr)


def find_text_files(directory: Path, excluded_dirs: Set[str]) -> Iterator[Path]:
    """
    Recursively find all text files in the given directory.

    Entries rejected by should_skip_path (symlinks, hidden files/dirs,
    excluded names) are ignored.  Each remaining regular file accepted
    by is_text_file is yielded as a Path; each remaining directory is
    descended into.

    Entries are visited in sorted order so that the sequence of yielded
    paths is the same from run to run.

    Errors never propagate to the caller.  A directory that cannot be
    listed is reported on stderr and contributes nothing.  An entry that
    cannot be inspected is reported on stderr and skipped, and the
    remaining entries of the same directory are still processed.
    """
    try:
        # iterdir() makes no promise about ordering; sorting makes the
        # output reproducible.
        entries = sorted(directory.iterdir())
    except Exception as e:
        _report_access_error(e)
        return

    for path in entries:
        # Only the filesystem probes are guarded, one entry at a time, so
        # a failure on one entry does not abandon its siblings.
        try:
            if should_skip_path(path, excluded_dirs):
                continue
            is_file = path.is_file()
            is_dir = not is_file and path.is_dir()
        except Exception as e:
            _report_access_error(e)
            continue

        if is_file:
            if is_text_file(path):
                yield path
        elif is_dir:
            yield from find_text_files(path, excluded_dirs)


def print_file_contents(file_path: Path) -> None:
    """
    Print a header containing the file path, followed by its contents.

    The file is read completely, as UTF-8, before anything is written to
    stdout.  If it cannot be opened or decoded, only an error message is
    written to stderr, so no header appears on stdout without the
    contents that should follow it.

    Any exception raised while reading or printing is reported on stderr
    rather than propagated.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Read first: a decode error must surface before the header
            # has been emitted.
            contents = f.read()
        print(f"\n{'#' * 10} {file_path}\n")
        print(contents)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)


def process_directory(directory: Path, excluded_dirs: Set[str]) -> None:
    """
    Print the text files reachable from one command-line path.

    A path that does not exist is reported on stderr.  A directory is
    searched with find_text_files and every file found is printed.  Any
    other existing path is handed straight to print_file_contents,
    without the text-file test or the exclusion rules.
    """
    if not directory.exists():
        print(f"Error: {directory} does not exist", file=sys.stderr)
    elif directory.is_dir():
        try:
            for text_file in find_text_files(directory, excluded_dirs):
                print_file_contents(text_file)
        except Exception as exc:
            print(f"Error processing directory {directory}: {exc}", file=sys.stderr)
    else:
        # Exists but is not a directory: print it directly.
        print_file_contents(directory)


def main():
    """
    Entry point: parse the command line, then process each named path.
    """
    options = parse_args()

    # A set gives constant-time membership tests on the excluded names.
    skip_names = set(options.exclude_dir)

    # Paths are handled one after another, in command-line order.
    for target in options.directories:
        process_directory(target, skip_names)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()