<?php

namespace Bespin\DocumentClient\Parser;

use Bespin\DocumentClient\DocumentType\DatevHealthInsurance;
use Bespin\DocumentClient\DocumentType\DatevHealthInsuranceAllocation;
use Bespin\DocumentClient\DocumentType\DatevProtocolPayrollDeclaration;
use Bespin\DocumentClient\DocumentType\DatevProtocolStatementOfContribution;
use Bespin\DocumentClient\DocumentType\DatevWageJournal;
use Bespin\DocumentClient\DocumentType\DatevWageJournalAnnual;
use Bespin\DocumentType\DocumentType;
use Bespin\DocumentClient\File\PdfFile;
use Bespin\DocumentClient\DocumentType\Document;
use Bespin\DocumentClient\DocumentType\Payroll;
use Bespin\DocumentClient\DocumentType\WageTax;
use Bespin\DocumentClient\Model\ParserModelInterface;
use DateTime;
use DateTimeZone;
use Exception;

/**
 * Splits a DATEV PDF export into typed documents.
 *
 * Every page is identified by the form number found in its text ("Form.-Nr. ...").
 * Payroll (LOGN15) and wage tax (LO4725) pages each become a document of their own and
 * are tied to an employee; pages of the other known forms are merged into one document
 * per form type. Pages that cannot be identified are collected into one generic document.
 */
class DatevParser
{
    /**
     * Matches a dd.mm.yyyy date with a year of 19xx or 2xxx.
     * A successful match always yields four entries (full match plus day, month, year).
     */
    private const DATE_PATTERN = '/(0[1-9]|[12]\d|3[01])\.(0[1-9]|1[0-2])\.(19\d{2}|2\d{3})/';

    /** Full German month names as they appear in blank-free, lower-cased payroll text. */
    private const MONTH_NEEDLES_FULL = [
        '01' => 'fürjanuar',
        '02' => 'fürfebruar',
        '03' => 'fürmärz',
        '04' => 'fürapril',
        '05' => 'fürmai',
        '06' => 'fürjuni',
        '07' => 'fürjuli',
        '08' => 'füraugust',
        '09' => 'fürseptember',
        '10' => 'füroktober',
        '11' => 'fürnovember',
        '12' => 'fürdezember',
    ];

    /** Abbreviated month names; only consulted when no full month name was found. */
    private const MONTH_NEEDLES_SHORT = [
        '01' => 'fürjan',
        '02' => 'fürfeb',
        '03' => 'fürmrz',
        '04' => 'fürapr',
        '05' => 'fürmai',
        '06' => 'fürjun',
        '07' => 'fürjul',
        '08' => 'füraug',
        '09' => 'fürsep',
        '10' => 'fürokt',
        '11' => 'fürnov',
        '12' => 'fürdez',
    ];

    private PdfFile $pdfDocument;

    public function __construct(PdfFile $pdfDocument)
    {
        $this->pdfDocument = $pdfDocument;
    }

    /**
     * Detects which employee-related document types occur in the given text.
     *
     * @param string $content Text to inspect; when empty, the text of the wrapped PDF is used.
     * @return array<string, DocumentType> Detected types, keyed by their enum value.
     */
    public function determineDocumentType(string $content = ''): array
    {
        if ($content === '') {
            $content = $this->pdfDocument->getText();
        }
        $compressedLowerContent = $this->lowerCaseNoBlanksString($content);
        $result                 = [];
        if (str_contains($compressedLowerContent, 'logn15')) {
            // That string is the template name for payroll (DATEV)
            $result[DocumentType::PAYROLL->value] = DocumentType::PAYROLL;
        }
        if (
            str_contains($compressedLowerContent, 'lo4725')
            && str_contains($compressedLowerContent, 'lohnsteuerbescheinigung')
        ) {
            // That string is the template name for wage tax certification (DATEV)
            $result[DocumentType::WAGE_TAX->value] = DocumentType::WAGE_TAX;
        }
        return $result;
    }

    /**
     * Splits the wrapped PDF into pages and builds the resulting documents.
     *
     * Result order: per-page documents (payroll, wage tax) in page order, then one merged
     * document per grouped form type, then a single generic document with all leftover pages.
     *
     * @param ParserModelInterface $parserModel Used to look up employees by their number.
     * @return array<int, Document>
     * @throws Exception When a payroll page matches an employee but its year or date cannot be read.
     */
    public function getDocuments(ParserModelInterface $parserModel): array
    {
        $result         = [];
        $remainingPages = [];
        $pageRanges     = [];
        foreach ($this->pdfDocument->splitDocumentGetPages() as $page) {
            [$identifiedPage, $type] = $this->identifyPage($page, $parserModel);
            if ($identifiedPage === null) {
                $remainingPages[$page->getParentPageNumber()] = $page;
            } elseif ($type === '') {
                $result[] = $identifiedPage;
            } else {
                $pageRanges[$type][$page->getParentPageNumber()] = $identifiedPage;
            }
        }
        foreach ($pageRanges as $type => $range) {
            // The merged document takes the date of the first page of its group.
            $documentDate = $range[array_key_first($range)]->getDocumentDate();
            $document     = $this->pdfDocument->getPageRange(array_keys($range));
            $document->setDocumentDate($documentDate);
            $result[] = match ($type) {
                'lok111'        => new DatevHealthInsurance($document, $documentDate),
                'loqn31'        => new DatevHealthInsuranceAllocation($document, $documentDate),
                'loa312'        => new DatevWageJournal($document, $documentDate),
                'loa312_annual' => new DatevWageJournalAnnual($document, $documentDate),
                'lohn31'        => new DatevProtocolPayrollDeclaration($document, $documentDate),
                'loa104'        => new DatevProtocolStatementOfContribution($document, $documentDate),
                default         => new Document($document),
            };
        }
        if (!empty($remainingPages)) {
            // Pass page numbers, exactly like the grouped ranges above do.
            // NOTE(review): assumes getPageRange() expects page numbers - confirm against PdfFile.
            $result[] = new Document($this->pdfDocument->getPageRange(array_keys($remainingPages)));
        }
        return $result;
    }

    /**
     * Identifies a single page by its form number.
     *
     * @return array{0: ?Document, 1: string} The parsed page (null when unidentified) and the
     *                                         grouping key; the key is '' for pages that form a
     *                                         document on their own.
     * @throws Exception
     */
    private function identifyPage(PdfFile $page, ParserModelInterface $parserModel): array
    {
        $textContent = $this->lowerCaseNoBlanksString($page->getText());

        // NOTE(review): 'loqn31' is handled by parseLOGN31() and 'form.nr.loa104' lacks the
        // hyphen the other needles have - both kept as found, confirm against real documents.
        return match (true) {
            str_contains($textContent, 'form.-nr.logn15') => [$this->parseLOGN15($page, $parserModel), ''],
            str_contains($textContent, 'form.-nr.lo4725') => [$this->parseLO4725($page, $parserModel), ''],
            str_contains($textContent, 'form.-nr.lok111') => [$this->parseLOK111($page), 'lok111'],
            str_contains($textContent, 'form.-nr.loqn31') => [$this->parseLOGN31($page), 'loqn31'],
            str_contains($textContent, 'form.-nr.loa312') => $this->identifyWageJournal($page),
            str_contains($textContent, 'form.-nr.lohn31') => [$this->parseLOHN31($page), 'lohn31'],
            str_contains($textContent, 'form.nr.loa104')  => [$this->parseLOA104($page), 'loa104'],
            default                                       => [null, ''],
        };
    }

    /**
     * Parses a wage journal page and picks the grouping key for monthly vs. annual journals.
     *
     * @return array{0: ?Document, 1: string}
     */
    private function identifyWageJournal(PdfFile $page): array
    {
        $journal = $this->parseLOA312($page);
        return [$journal, $journal instanceof DatevWageJournalAnnual ? 'loa312_annual' : 'loa312'];
    }

    /**
     * Builds a wage tax certificate from a LO4725 page.
     *
     * @return Document|null Null when the employee is unknown, the names on the page do not
     *                       match, or the transfer timestamp, headline or year cannot be found.
     */
    private function parseLO4725(PdfFile $page, ParserModelInterface $parserModel): ?Document
    {
        $employee = $parserModel->getEmployee($this->parseEmployeeNumber($page));
        if ($employee === null) {
            return null;
        }
        $content = strtolower(str_replace(' ', '', $page->getText()));
        // check if employee number and name are matching
        if (!$this->pageMentionsEmployee($content, $employee)) {
            return null;
        }

        // determine document date
        $pattern = '/(?:-lst-.*?)'    // Non-capturing group: -lst- followed by any characters
            .'((19|20)\d\d'           // Capturing group: Year 1900-2099
            .'(0[1-9]|1[0-2])'        // Month 01-12
            .'(0[1-9]|[12]\d|3[01])-' // Day 01-31
            .'([01]\d|2[0-3])'        // Hour 00-23
            .'([0-5]\d)'              // Minute 00-59
            .'([0-5]\d))/';           // Second 00-59
        $matches = [];
        if (preg_match($pattern, $content, $matches) !== 1) {
            return null;
        }
        // createFromFormat() reports failure by returning false, not by throwing.
        $documentDate = DateTime::createFromFormat('Ymd-His', $matches[1], new DateTimeZone('Europe/Berlin')) ?: null;

        $needle = 'ausdruckderelektronischenlohnsteuerbescheinigungfür';
        $start  = strpos($content, $needle);
        if ($start === false) {
            return null;
        }
        if ($documentDate !== null) {
            $page->setDocumentDate($documentDate);
        }
        // The four characters following the headline are the year the certificate is issued for.
        $contentYear = DateTime::createFromFormat('!Y', substr($content, $start + strlen($needle), 4));
        if ($contentYear === false) {
            return null;
        }
        $wageTax = new WageTax($page, $employee, $contentYear);
        $wageTax->setDocumentDate($documentDate);
        return $wageTax;
    }

    /**
     * Builds a payroll document from a LOGN15 page.
     *
     * @return Document|null Null when the employee is unknown, the names on the page do not
     *                       match, no month is found, or the dates cannot be turned into objects.
     * @throws Exception When a month is found but the year or the page date behind it is missing.
     */
    private function parseLOGN15(PdfFile $page, ParserModelInterface $parserModel): ?Document
    {
        $employee = $parserModel->getEmployee($this->parseEmployeeNumber($page));
        if ($employee === null) {
            return null;
        }
        $content = strtolower(str_replace(' ', '', $page->getText()));
        if (!$this->pageMentionsEmployee($content, $employee)) {
            return null;
        }
        $monthHit = $this->findPayrollMonth($content);
        if ($monthHit === null) {
            return null;
        }
        [$loBuMonth, $needle]       = $monthHit;
        [$loBuYear, $date, $suffix] = $this->getPageData($content, $needle);

        try {
            $documentDate = new DateTime(trim($date), new DateTimeZone('Europe/Berlin'));
        } catch (Exception) {
            return null;
        }
        $page->setDocumentDate($documentDate);
        if (!($loBuYear > 0)) {
            return null;
        }
        $titleDate = DateTime::createFromFormat('!Y-m-d', $loBuYear.'-'.$loBuMonth.'-01');
        if ($titleDate === false) {
            // The characters behind the month were not a usable year.
            return null;
        }
        return new Payroll($page, $employee, $titleDate, $suffix);
    }

    /**
     * Finds the payroll month in blank-free, lower-cased page text.
     *
     * All full month names are tried before any abbreviation.
     *
     * @return array{0: string, 1: string}|null Two-digit month and the needle that matched.
     */
    private function findPayrollMonth(string $content): ?array
    {
        foreach ([self::MONTH_NEEDLES_FULL, self::MONTH_NEEDLES_SHORT] as $needles) {
            foreach ($needles as $month => $needle) {
                if (str_contains($content, $needle)) {
                    // PHP turns the keys '10'..'12' into integers; normalise back to string.
                    return [(string)$month, $needle];
                }
            }
        }
        return null;
    }

    /**
     * Checks that both first and last name of the employee occur in the page text.
     *
     * The page text has its blanks removed, so blanks are removed from the names as well;
     * otherwise names consisting of several words could never match.
     *
     * @param string $content  Lower-cased page text without blanks.
     * @param object $employee Object exposing firstName and lastName.
     */
    private function pageMentionsEmployee(string $content, object $employee): bool
    {
        $firstName = str_replace(' ', '', strtolower((string)$employee->firstName));
        $lastName  = str_replace(' ', '', strtolower((string)$employee->lastName));
        return str_contains($content, $firstName) && str_contains($content, $lastName);
    }

    /** Wraps a LOK111 page as a health insurance document. */
    private function parseLOK111(PdfFile $page): ?Document
    {
        $documentDate = $this->stampDocumentDate($page);
        $document     = new DatevHealthInsurance($page, $documentDate);
        $document->setDocumentDate($documentDate);
        return $document;
    }

    /** Wraps a LOQN31 page as a health insurance allocation document. */
    private function parseLOGN31(PdfFile $page): ?Document
    {
        $documentDate = $this->stampDocumentDate($page);
        $document     = new DatevHealthInsuranceAllocation($page, $documentDate);
        $document->setDocumentDate($documentDate);
        return $document;
    }

    /** Wraps a LOA312 page as a wage journal; pages containing "jahreswerte" become the annual variant. */
    private function parseLOA312(PdfFile $page): ?Document
    {
        $documentDate = $this->stampDocumentDate($page);
        $document     = str_contains(strtolower($page->getText()), 'jahreswerte')
            ? new DatevWageJournalAnnual($page, $documentDate)
            : new DatevWageJournal($page, $documentDate);
        $document->setDocumentDate($documentDate);
        return $document;
    }

    /** Wraps a LOHN31 page as a payroll declaration protocol. */
    private function parseLOHN31(PdfFile $page): ?Document
    {
        $documentDate = $this->stampDocumentDate($page);
        $document     = new DatevProtocolPayrollDeclaration($page, $documentDate);
        $document->setDocumentDate($documentDate);
        return $document;
    }

    /** Wraps a LOA104 page as a statement of contribution protocol. */
    private function parseLOA104(PdfFile $page): ?Document
    {
        $documentDate = $this->stampDocumentDate($page);
        $document     = new DatevProtocolStatementOfContribution($page, $documentDate);
        $document->setDocumentDate($documentDate);
        return $document;
    }

    /**
     * Reads the first date from the page text, stores it on the page and returns it.
     */
    private function stampDocumentDate(PdfFile $page): ?DateTime
    {
        $documentDate = $this->getDate($page->getText());
        $page->setDocumentDate($documentDate);
        return $documentDate;
    }

    /**
     * Extracts year, page date and optional suffix from the line that follows a month needle.
     *
     * The line runs from the end of the needle to the next line break (or to the end of the
     * text when there is none). The year is the four characters starting at the first digit;
     * the suffix is the text of the first "(...)" pair behind the year; the date is the first
     * dd.mm.yyyy date behind the year.
     *
     * @param string $content Lower-cased page text without blanks.
     * @param string $needle  Month needle that is contained in $content.
     * @return array{0: string, 1: string, 2: string} Year, date (dd.mm.yyyy) and suffix ('' if absent).
     * @throws Exception When the needle, the year or the date cannot be found.
     */
    private function getPageData(string $content, string $needle): array
    {
        $needlePos = strpos($content, $needle);
        if ($needlePos === false) {
            throw new Exception('Couldn\'t determine page date');
        }
        $pos     = $needlePos + strlen($needle);
        $lineEnd = strpos($content, PHP_EOL, $pos);
        $line    = $lineEnd === false ? substr($content, $pos) : substr($content, $pos, $lineEnd - $pos);

        $matches = [];
        if (preg_match('/\d/', $line, $matches, PREG_OFFSET_CAPTURE) !== 1) {
            throw new Exception('Couldn\'t determine page year');
        }
        $yearOffset = $matches[0][1];
        $loBuYear   = substr($line, $yearOffset, 4);
        $line       = substr($line, $yearOffset + 4);

        $suffix = '';
        $open   = strpos($line, '(');
        if ($open !== false) {
            // Only a closing parenthesis behind the opening one delimits a suffix.
            $close = strpos($line, ')', $open + 1);
            if ($close !== false) {
                $suffix = substr($line, $open + 1, $close - $open - 1);
            }
        }

        $dateMatches = [];
        if (preg_match(self::DATE_PATTERN, $line, $dateMatches) !== 1) {
            throw new Exception('Couldn\'t determine page date');
        }
        return [$loBuYear, $dateMatches[0], $suffix];
    }

    /**
     * Lower-cases the text and removes all spaces and tabs (line breaks are kept).
     */
    private function lowerCaseNoBlanksString(string $text): string
    {
        // preg_replace() returns null on a PCRE error; treat that as empty text.
        return strtolower(preg_replace('/[\t ]+/', '', $text) ?? '');
    }

    /**
     * Reads the employee number that is printed between "*Pers.-Nr." and the next "*".
     *
     * @return int The employee number, or 0 when the marker is missing or no number follows it.
     */
    private function parseEmployeeNumber(PdfFile $pdf): int
    {
        $text      = $this->lowerCaseNoBlanksString($pdf->getText());
        $marker    = '*pers.-nr.';
        $markerPos = strpos($text, $marker);
        if ($markerPos === false) {
            return 0;
        }
        $start = $markerPos + strlen($marker);
        $end   = strpos($text, '*', $start);
        return (int)($end === false ? substr($text, $start) : substr($text, $start, $end - $start));
    }

    /**
     * Returns the first dd.mm.yyyy date found in the text, in the Europe/Berlin timezone.
     *
     * @return DateTime|null Null when the text contains no such date or it cannot be parsed.
     */
    private function getDate(string $text): ?DateTime
    {
        $dateMatches = [];
        if (preg_match(self::DATE_PATTERN, $text, $dateMatches) !== 1) {
            return null;
        }
        try {
            return new DateTime($dateMatches[0], new DateTimeZone('Europe/Berlin'));
        } catch (Exception) {
            return null;
        }
    }
}
