#!/usr/bin/env python3
"""
CSV Column Remapper for Key-Value Pair Data
Processes CSV files where identifiers and values alternate in columns.

Supports BOTH:
  - Numeric identifiers (e.g., 1200319923593372)
  - Text markers (e.g., due_on:, name:)

Example input format:
  task_id, identifier1, value1, due_on:, 2026-03-20, name:, Task Name...
  1213413361584562, 1200319923593372, Yes, due_on:, 2026-03-20, name:, Dean Away

Usage:
  python csv_remapper.py input.csv output.csv --map id1:label1 "due_on:":due_date ...
"""

import csv
import re
import argparse
from pathlib import Path
from collections import OrderedDict


def parse_key_value_csv(input_file, output_file, identifier_labels, include_all=False):
    """
    Parse a CSV whose rows hold alternating identifier/value columns and
    write a remapped CSV with one named column per mapped identifier.

    Each data row looks like:
        task_id, <identifier>, <value>, <identifier>, <value>, ...
    where an identifier is either a numeric ID (10+ digits) or a text
    marker ending in ':' (e.g. "due_on:").

    Args:
        input_file: Path to the input CSV.
        output_file: Path where the remapped CSV is written.
        identifier_labels: Mapping {identifier: output column label}.
                           Text markers must include the trailing colon
                           (e.g. "due_on:").
        include_all: If True, also emit columns for unmapped identifiers
                     (numeric IDs become "field_<id>", text markers lose
                     their colon). Default: only mapped fields.

    Returns:
        List of processed row OrderedDicts (one per kept input row).

    Raises:
        ValueError: If the input file contains no rows at all.
    """

    def _clean_value(raw):
        # The export writes the literal string 'None' for missing values;
        # normalize it (and genuinely empty cells) to ''.
        return '' if raw in ('None', '') else raw

    with open(input_file, 'r', newline='', encoding='utf-8') as f:
        raw_rows = list(csv.reader(f))

    if not raw_rows:
        raise ValueError("Input file is empty")

    # Skip the header row only when it looks like bare column indexes
    # (0, 1, 2, ...); rows with any non-numeric cell are treated as data.
    first_row = raw_rows[0]
    if all(re.match(r'^\d+$', str(cell).strip()) for cell in first_row[:5]):
        data_rows = raw_rows[1:]
    else:
        data_rows = raw_rows

    processed_rows = []
    extra_fields = set()

    for row in data_rows:
        # Strip whitespace from every cell; empty/None cells become ''.
        row = [cell.strip() if cell else '' for cell in row]

        # Skip blank rows and rows missing a task id in column 0.
        if not row or not row[0]:
            continue

        # task_id is always the first output column.
        record = OrderedDict()
        record['task_id'] = row[0]

        # Pre-seed every mapped column so each record has the full schema.
        for label in identifier_labels.values():
            record[label] = ''

        # Walk the remaining cells as identifier/value pairs.
        # NOTE: this was `while i < len(row) - 1`, which silently skipped an
        # identifier sitting in the final column; the next_val fallback below
        # already handles the missing value, so scan every cell.
        i = 1
        while i < len(row):
            current = row[i]
            next_val = row[i + 1] if i + 1 < len(row) else ''

            if not current:
                i += 1
                continue

            if re.match(r'^\d{10,}$', current):
                # Numeric identifier (10+ digits) followed by its value.
                value = _clean_value(next_val)
                if current in identifier_labels:
                    record[identifier_labels[current]] = value
                elif include_all:
                    field_name = f"field_{current}"
                    record[field_name] = value
                    extra_fields.add(field_name)
                i += 2
            elif current.endswith(':'):
                # Text marker (e.g. "due_on:") followed by its value.
                value = _clean_value(next_val)
                if current in identifier_labels:
                    record[identifier_labels[current]] = value
                elif include_all:
                    # Drop the colon for a cleaner column name.
                    marker_name = current.rstrip(':')
                    record[marker_name] = value
                    extra_fields.add(marker_name)
                i += 2
            else:
                # Stray cell with no preceding identifier; skip it.
                i += 1

        processed_rows.append(record)

    # Output schema: task_id, then mapped labels, then discovered extras.
    fieldnames = ['task_id'] + list(identifier_labels.values())
    if include_all:
        fieldnames += sorted(extra_fields)

    # Guarantee every record carries every output column.
    for record in processed_rows:
        for fname in fieldnames:
            record.setdefault(fname, '')

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(processed_rows)

    print(f"✓ Processed {len(processed_rows)} rows")
    print(f"✓ Output columns ({len(fieldnames)}): {fieldnames}")
    print(f"✓ Output saved to: {output_file}")

    return processed_rows


def parse_mapping_args(map_args):
    """
    Convert CLI ``identifier:label`` strings into an ordered mapping.

    Accepted forms:
      - ``1234567890:label``    -> {'1234567890': 'label'}
      - ``"due_on:":label``     -> {'due_on:': 'label'}  (literal quotes kept
                                    by the shell, e.g. when escaped)
      - ``due_on::label``       -> {'due_on:': 'label'}  (quotes stripped by
                                    the shell, leaving a double colon)

    Args:
        map_args: Iterable of mapping strings from ``--map``.

    Returns:
        OrderedDict mapping identifier -> output column label, in the order
        given on the command line.

    Raises:
        ValueError: If a pair contains no ``:`` separator.
    """
    identifier_labels = OrderedDict()
    for pair in map_args:
        if ':' not in pair:
            raise ValueError(f"Invalid mapping '{pair}'. Use format 'identifier:label'")

        if pair.startswith('"') and '":' in pair:
            # Literal-quoted marker: "due_on:":due_date.  The text before the
            # closing quote already contains the marker's colon, so only add
            # one if it is missing.  (The previous code unconditionally
            # appended ':', producing the never-matching 'due_on::'.)
            marker, _, label = pair.partition('":')
            identifier = marker[1:]  # drop the opening quote
            if not identifier.endswith(':'):
                identifier += ':'
        elif '::' in pair and not pair.startswith('"'):
            # Unquoted double-colon form: due_on::due_date (shell ate the
            # quotes).  Split at the double colon: the part before it is the
            # marker name, which gets its trailing colon restored.  (The
            # previous rsplit(':', 1) kept the first colon and appended a
            # second, also producing 'due_on::'.)
            identifier, _, label = pair.partition('::')
            identifier += ':'
        else:
            # Standard numeric form: 1234567890:label.
            identifier, label = pair.split(':', 1)
            identifier = identifier.strip()

        identifier_labels[identifier] = label.strip()
    return identifier_labels


def main():
    """
    CLI entry point: parse arguments, build the identifier mapping, and run
    the remapper.  Returns 0 on success, 1 on any error (missing input file,
    malformed --map pair, or processing failure).
    """
    parser = argparse.ArgumentParser(
        description='Remap CSV columns from identifier-value pairs. Supports numeric IDs and text markers (e.g., due_on:)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  # Numeric identifiers only
  python csv_remapper.py input.csv output.csv --map 1200319923593372:status 1189842108382269:department

  # Include text markers (use quotes for colons)
  python csv_remapper.py input.csv output.csv \
    --map 1200319923593372:status "due_on:":due_date "name:":task_name

  # Mixed identifiers and markers
  python csv_remapper.py input.csv output.csv \
    --map 1200319923593372:status 1206595535112943:assigned_to "due_on:":due_date "name:":task_name
        '''
    )

    parser.add_argument('input_file', help='Input CSV file path')
    parser.add_argument('output_file', help='Output CSV file path')
    parser.add_argument(
        '--map', '-m',
        nargs='+',
        required=True,
        help='Identifier-label pairs in format id:label. For text markers, include colon in quotes (e.g., "due_on:":due_date)'
    )
    parser.add_argument(
        '--include-all', '-a',
        action='store_true',
        help='Include unmapped fields (default: only mapped fields)'
    )

    args = parser.parse_args()

    # Fail fast with a readable message rather than an open() traceback.
    if not Path(args.input_file).exists():
        print(f"Error: Input file '{args.input_file}' not found")
        return 1

    try:
        identifier_labels = parse_mapping_args(args.map)
    except ValueError as e:
        print(f"Error: {e}")
        return 1

    try:
        parse_key_value_csv(args.input_file, args.output_file, identifier_labels, args.include_all)
    except Exception as e:
        print(f"Error processing file: {e}")
        return 1

    return 0


# Default configuration for the specific export this script was written
# against.  The two lists pair positionally: DEFAULT_IDENTIFIERS[i] is
# remapped to the column name DEFAULT_LABELS[i].
DEFAULT_IDENTIFIERS = [
    "1200319923593372",   # status: Yes, Processing, On Order
    "1200319923593381",   # approval status
    "1201654500615241",   # category
    "1189842108382269",   # department: Workshop, Pick Up, Air Seeder
    "1206425158320994",   # location
    "1206595535112943",   # assigned_to: Brad, Nikita, Jenszy
    "due_on:",            # due date
    "name:",              # task name
]

DEFAULT_LABELS = [
    "status",
    "approval",
    "category",
    "department",
    "location",
    "assigned_to",
    "due_date",
    "task_name",
]

# Ready-to-use mapping built from the parallel lists above; previously the
# lists were declared but never combined.  Pass this as `identifier_labels`
# to parse_key_value_csv() to get the default schema without the CLI.
DEFAULT_MAPPING = OrderedDict(zip(DEFAULT_IDENTIFIERS, DEFAULT_LABELS))


if __name__ == '__main__':
    import sys
    # Propagate main()'s return code (0 = success, 1 = error) to the shell.
    sys.exit(main())
