#!/usr/bin/env python3

import pypdf
import os
import hashlib
import binascii
import json
import argparse
from pypdf import PdfReader, PdfWriter
from pypdf.generic import ByteStringObject, ArrayObject

def split(input_pdf, output_dir, pages, output_prefix='document_', output_format='json'):
    """Split ``input_pdf`` into several PDFs written under ``output_dir``.

    ``pages`` is a ';'-separated list of documents; each document is a
    ','-separated list of 1-based page numbers or "start-end" ranges,
    e.g. ``"1-5;6;8;10,12-15"``.

    Prints the created file paths (JSON array or plain lines, per
    ``output_format``) and returns them as a list.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    reader = PdfReader(input_pdf)

    # Extract metadata and the original /ID so the split documents stay
    # traceable to their source document.
    info, original_id = get_trailer_fields(reader, input_pdf)

    created_files = []
    for document_number, document in enumerate(pages.split(';'), start=1):
        writer = PdfWriter()
        page_numbers = get_page_numbers(document)
        for page_number in page_numbers:
            # Page numbers are 1-based on the CLI; reader.pages is 0-based.
            writer.add_page(reader.pages[page_number - 1])

        # Derive a new deterministic /ID from the original ID plus the page
        # selection so each split document gets a unique identifier.
        new_id = generate_new_id(original_id, ','.join(map(str, page_numbers)))
        # Carry the original metadata and the new ID into the output trailer.
        update_trailer(writer, info, new_id)

        # Write the new PDF to the output directory.
        output_pdf_path = os.path.join(output_dir, f'{output_prefix}{document_number}.pdf')
        with open(output_pdf_path, 'wb') as out_f:
            writer.write(out_f)
        created_files.append(output_pdf_path)

    if output_format == 'json':
        print(json.dumps(created_files, indent=4))
    else:
        for file in created_files:
            print(file)

    return created_files

def merge(output_pdf, input_pdfs):
    """Concatenate ``input_pdfs`` (in the given order) into ``output_pdf``."""
    writer = PdfWriter()
    for source in input_pdfs:
        writer.append(source)
    with open(output_pdf, 'wb') as destination:
        writer.write(destination)

def get_metadata(input_pdf, output_format):
    """Print the document-information metadata of ``input_pdf``.

    Emits a JSON object when ``output_format`` is 'json', otherwise one
    "key: value" line per metadata entry.
    """
    metadata = PdfReader(input_pdf).metadata
    if output_format == 'json':
        serializable = {key: str(value) for key, value in metadata.items()}
        print(json.dumps(serializable, indent=4))
    else:
        for key, value in metadata.items():
            print(f"{key}: {value}")

def set_metadata(input_pdf, metadata_json, output_pdf):
    """Copy ``input_pdf`` to ``output_pdf``, overlaying metadata from ``metadata_json``.

    The original metadata is added first and then overlaid with the entries
    from ``metadata_json`` (a JSON object string), so keys not mentioned keep
    their previous values. The trailer /Info and /ID are preserved as well.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    # Keep the original metadata, then overlay the caller-supplied entries.
    writer.add_metadata(reader.metadata)
    writer.add_metadata(json.loads(metadata_json))

    # `doc_id` instead of `id`: don't shadow the builtin.
    info, doc_id = get_trailer_fields(reader, input_pdf)
    # get_trailer_fields returns the ID as a hex *string*, but update_trailer
    # wraps it in ByteStringObject (a bytes subclass), which needs raw bytes —
    # the same form split() passes. Convert back before handing it over.
    update_trailer(writer, info, binascii.unhexlify(doc_id))

    # NOTE(review): when output_pdf == input_pdf this overwrites the file the
    # reader was opened from; pages appear to be copied up front here, but
    # confirm in-place updates behave on large files.
    with open(output_pdf, 'wb') as out_f:
        writer.write(out_f)

def get_page_numbers(document_pages):
    """Expand a comma-separated page spec (e.g. "1,3-5") into a list of ints."""
    result = []
    for token in document_pages.split(','):
        if '-' in token:
            # A "start-end" range is inclusive on both ends.
            first, last = (int(part) for part in token.split('-'))
            result += list(range(first, last + 1))
        else:
            # A bare token is a single page number.
            result.append(int(token))
    return result

def get_trailer_fields(reader, input_pdf):
    """Return ``(info, doc_id)`` from the PDF trailer of ``reader``.

    ``info`` is the trailer's /Info dictionary (or None). ``doc_id`` is the
    first (permanent) /ID entry as a hex string; when the trailer has no /ID,
    the MD5 of the file contents is used as a stable fallback identifier.
    """
    trailer = reader.trailer
    info = trailer.get('/Info', None)
    # `doc_id` instead of `id`: don't shadow the builtin.
    doc_id = trailer.get('/ID', None)
    if doc_id:
        # /ID is an array of two byte strings; hexlify the first one so the
        # rest of the code can work with a plain str.
        doc_id = binascii.hexlify(doc_id[0]).decode('utf-8')
    else:
        # No /ID present: derive a deterministic ID from the file contents.
        doc_id = calculate_md5(input_pdf)
    return info, doc_id

def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        # Stream in 4 KiB chunks so large files never sit fully in memory.
        while chunk := handle.read(4096):
            digest.update(chunk)
    return digest.hexdigest()

def generate_new_id(original_id, document_pages):
    """Derive a 16-byte PDF /ID from ``original_id`` plus a page-selection string.

    MD5 provides a deterministic, fixed-length identifier here — it is used
    for uniqueness, not for security.
    """
    # Concatenate the original ID and the page string as the hash input.
    data = original_id + document_pages
    # digest() yields the raw 16 bytes directly; the previous
    # hexdigest() -> unhexlify() round-trip produced the same value.
    return hashlib.md5(data.encode('utf-8')).digest()

def update_trailer(writer, info, new_id):
    """Copy /Info metadata (if any) onto ``writer`` and set its trailer /ID."""
    if info:
        # Preserve the original document-information dictionary.
        writer.add_metadata(info)
    # Both slots of the /ID pair receive the same value; pypdf exposes the
    # trailer ID through the private _ID attribute.
    identifier = ByteStringObject(new_id)
    writer._ID = ArrayObject([identifier, identifier])

def number_of_pages(input_pdf):
    """Return the number of pages in ``input_pdf``."""
    return len(PdfReader(input_pdf).pages)

def main():
    parser = argparse.ArgumentParser(description='PDF utility tool')
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    split_parser = subparsers.add_parser('split', help='Split a PDF into multiple documents')
    split_parser.add_argument('--in', dest='input_pdf', required=True, help='Path to the input PDF file')
    split_parser.add_argument('--dir', dest='output_dir', required=True, help='Directory where the output PDFs will be saved')
    split_parser.add_argument('--pages', required=True, help='Pages to include in the output PDFs, e.g., "1-5;6;8;10,12-15"')
    split_parser.add_argument('--prefix', dest='output_prefix', default='document_', help='Prefix for the output PDF files')
    split_parser.add_argument('--format', choices=['text', 'json'], default='json', help='Output format for the list of created files')

    merge_parser = subparsers.add_parser('merge', help='Merge multiple PDFs into one document')
    merge_parser.add_argument('--in', dest='input_pdfs', nargs='+', required=True, help='Paths to the input PDF files to merge')
    merge_parser.add_argument('--out', dest='output_pdf', required=True, help='Path to the output merged PDF file')

    metadata_parser = subparsers.add_parser('metadata', help='Get or set PDF metadata')
    metadata_subparsers = metadata_parser.add_subparsers(dest='metadata_command', help='Metadata commands')

    get_metadata_parser = metadata_subparsers.add_parser('get', help='Get metadata from a PDF')
    get_metadata_parser.add_argument('--in', dest='input_pdf', required=True, help='Path to the input PDF file')
    get_metadata_parser.add_argument('--format', choices=['text', 'json'], default='json', help='Output format for the metadata')

    set_metadata_parser = metadata_subparsers.add_parser('set', help='Set metadata for a PDF')
    set_metadata_parser.add_argument('--in', dest='input_pdf', required=True, help='Path to the input PDF file')
    set_metadata_parser.add_argument('--data', required=True, help='Metadata in JSON format, e.g., \'{"Creator":"My Name", "Title": "document title"}\'')
    set_metadata_parser.add_argument('--out', dest='output_pdf', help='Path to the output PDF file. If not specified, the input file will be updated.')

    number_of_pages_parser = subparsers.add_parser('number-of-pages', help='Get the number of pages in a PDF')
    number_of_pages_parser.add_argument('--in', dest='input_pdf', required=True, help='Path to the input PDF file')

    args = parser.parse_args()

    match args.command:
        case 'split':
            split(args.input_pdf, args.output_dir, args.pages, args.output_prefix, args.format)
        case 'merge':
            merge(args.output_pdf, args.input_pdfs)
        case 'metadata':
            if args.metadata_command == 'get':
                get_metadata(args.input_pdf, args.format)
            elif args.metadata_command == 'set':
                output_pdf = args.output_pdf if args.output_pdf else args.input_pdf
                set_metadata(args.input_pdf, args.data, output_pdf)
        case 'number-of-pages':
            number_of_pages(args.input_pdf)

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()