# zig-fetch-py/zig_fetch_py/parser.py

"""
ZON parser module - Parses Zig Object Notation (ZON) files.
"""

import json
from typing import Any, Dict, List, Union, Optional

from loguru import logger


class ZonParser:
    """
    A recursive-descent parser for Zig Object Notation (ZON) text.

    ZON values are mapped to Python values as follows:

    * ``.{ .key = value, ... }`` -> ``dict``
    * ``.{ value, ... }``        -> ``list``
    * ``.{}``                    -> ``[]`` (or ``{}`` when ``empty_tuple_as_dict`` is set)
    * ``.name`` (enum literal)   -> ``str`` (the identifier text)
    * ``"text"``                 -> ``str``
    * numbers                    -> ``int`` or ``float``
    * ``true`` / ``false``       -> ``bool``
    * ``null``                   -> ``None``

    All syntax errors are reported as ``ValueError`` with the line and column
    at which parsing stopped.
    """

    _content: str
    _pos: int
    _line: int
    _col: int
    empty_tuple_as_dict: bool = False

    # Escape sequences that map to exactly one character.
    _SIMPLE_ESCAPES = {
        "n": "\n",
        "t": "\t",
        "r": "\r",
        '"': '"',
        "'": "'",
        "\\": "\\",
    }
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
    # Characters allowed inside a number body; "_" is Zig's digit separator.
    # frozensets are used (not strings) so the empty end-of-input marker
    # returned by _current_char() is never considered a member.
    _DECIMAL_DIGITS = frozenset("0123456789_")
    _RADIX_DIGITS = {
        "x": (16, frozenset("0123456789abcdefABCDEF_")),
        "o": (8, frozenset("01234567_")),
        "b": (2, frozenset("01_")),
    }

    def __init__(self, content: str, empty_tuple_as_dict: bool = False):
        """
        Initialize the parser with ZON content.

        Args:
            content: The ZON content to parse
            empty_tuple_as_dict: If True, empty tuples (.{}) will be parsed as empty dictionaries ({})
                               If False, empty tuples will be parsed as empty lists ([])
        """
        self._content = content
        self._pos = 0
        self._line = 1
        self._col = 1
        self.empty_tuple_as_dict = empty_tuple_as_dict

    def parse(self) -> Dict[str, Any]:
        """
        Parse the first ZON value in the content and return it.

        The result is whatever Python value the top-level ZON value maps to
        (usually a dict). Content after the first value is not inspected.

        Raises:
            ValueError: If the content is not valid ZON.
        """
        return self._parse_value()

    # ------------------------------------------------------------------
    # Low-level cursor helpers
    # ------------------------------------------------------------------

    def _error(self, message: str) -> ValueError:
        """Build a ValueError carrying the current line/column."""
        return ValueError(f"{message} at line {self._line}, column {self._col}")

    def _current_char(self) -> str:
        """Return the character under the cursor, or "" at end of input."""
        if self._pos >= len(self._content):
            return ""
        return self._content[self._pos]

    def _peek(self, offset: int = 1) -> str:
        """Return the character ``offset`` positions ahead, or "" past the end."""
        index = self._pos + offset
        if index >= len(self._content):
            return ""
        return self._content[index]

    def _next_char(self) -> str:
        """
        Consume and return one character, keeping line/column in sync.

        Returns "" (and leaves the cursor untouched) at end of input.
        """
        if self._pos >= len(self._content):
            return ""
        char = self._content[self._pos]
        self._pos += 1
        if char == "\n":
            self._line += 1
            self._col = 1
        else:
            self._col += 1
        return char

    def _advance(self, count: int) -> None:
        """Consume ``count`` characters through _next_char so line/column stay correct."""
        for _ in range(count):
            self._next_char()

    def _consume_while(self, allowed) -> None:
        """Consume characters for as long as they are members of ``allowed``."""
        while self._current_char() in allowed:
            self._next_char()

    def _skip_whitespace_and_comments(self):
        """Advance past whitespace and ``//`` line comments."""
        while self._pos < len(self._content):
            char = self._current_char()

            if char.isspace():
                self._next_char()
                continue

            if char == "/" and self._peek() == "/":
                # Skip to end of line; the newline itself is handled as whitespace.
                while self._pos < len(self._content) and self._current_char() != "\n":
                    self._next_char()
                continue

            break

    # ------------------------------------------------------------------
    # Grammar
    # ------------------------------------------------------------------

    def _parse_value(self) -> Any:
        """
        Parse a single ZON value starting at the cursor.

        Raises:
            ValueError: If the next character cannot start a value.
        """
        self._skip_whitespace_and_comments()
        char = self._current_char()

        if char == ".":
            self._next_char()  # Skip the dot
            if self._current_char() == "{":
                return self._parse_object()
            # Enum literal such as `.foo`; returned as the identifier text.
            return self._parse_identifier()
        if char == '"':
            return self._parse_string()
        if char.isdigit() or char == "-":
            return self._parse_number()
        if char in ("t", "f"):
            return self._parse_boolean()
        if self._content.startswith("null", self._pos):
            self._advance(4)
            return None
        raise self._error(f"Unexpected character '{char}'")

    def _parse_object(self) -> Union[Dict[str, Any], List[Any]]:
        """
        Parse a brace-delimited ZON aggregate; the cursor must be on ``{``.

        The aggregate is treated as a struct when its first element starts
        with ``.`` followed by anything other than ``{`` (i.e. a field name),
        and as a tuple otherwise. An empty aggregate yields ``{}`` or ``[]``
        depending on ``empty_tuple_as_dict``.
        """
        self._next_char()  # Skip the opening brace
        self._skip_whitespace_and_comments()

        if self._current_char() == "}":
            self._next_char()
            return {} if self.empty_tuple_as_dict else []

        # `.{` as the first element is a nested aggregate inside a tuple;
        # `.` followed by anything else is taken to be a field name.
        if self._current_char() == "." and self._peek() != "{":
            return self._parse_struct()
        return self._parse_tuple()

    def _parse_struct(self) -> Dict[str, Any]:
        """
        Parse struct fields up to and including the closing brace.

        A field without ``= value`` is stored with its own name as the value.
        When a key repeats, the last occurrence wins.

        Raises:
            ValueError: On a missing ``.`` before a key or a missing separator.
        """
        result = {}

        while True:
            self._skip_whitespace_and_comments()

            if self._current_char() == "}":
                self._next_char()
                break

            if self._current_char() != ".":
                raise self._error("Expected '.' before key")
            self._next_char()  # Skip the dot
            key = self._parse_identifier()

            self._skip_whitespace_and_comments()

            if self._current_char() == "=":
                self._next_char()
                value = self._parse_value()
            else:
                # Shorthand notation where key is the same as value
                value = key

            result[key] = value

            self._skip_whitespace_and_comments()

            if self._current_char() == ",":
                self._next_char()
            elif self._current_char() != "}":
                raise self._error("Expected ',' or '}'")

        return result

    def _parse_tuple(self) -> Union[Dict[str, Any], List[Any]]:
        """
        Parse tuple elements up to and including the closing brace.

        Returns:
            List[Any] for non-empty tuples, or Dict[str, Any] if empty and empty_tuple_as_dict=True

        Raises:
            ValueError: On an invalid element or a missing separator.
        """
        result = []

        self._skip_whitespace_and_comments()

        # Empty tuple: honour the configured representation.
        if self._current_char() == "}":
            self._next_char()
            return {} if self.empty_tuple_as_dict else []

        while True:
            self._skip_whitespace_and_comments()

            if self._current_char() == "}":
                self._next_char()
                break

            # _parse_value already handles `.{...}` and `.name` elements.
            result.append(self._parse_value())

            self._skip_whitespace_and_comments()

            if self._current_char() == ",":
                self._next_char()
            elif self._current_char() != "}":
                raise self._error("Expected ',' or '}'")

        return result

    def _parse_identifier(self) -> str:
        """
        Parse an identifier; the leading ``.`` must already be consumed.

        Supports plain identifiers (letters, digits, ``_`` and ``-``) and the
        quoted form ``@"any text"``.

        Raises:
            ValueError: If no identifier characters are present.
        """
        # Quoted identifiers (like .@"lsp-codegen")
        if self._current_char() == "@" and self._peek() == '"':
            self._next_char()  # Skip @
            return self._parse_string()

        start = self._pos
        while self._pos < len(self._content):
            char = self._current_char()
            if not (char.isalnum() or char == "_" or char == "-"):
                break
            self._next_char()

        if start == self._pos:
            raise self._error("Empty identifier")

        return self._content[start : self._pos]

    def _parse_string(self) -> str:
        """
        Parse a double-quoted string; the cursor must be on the opening quote.

        Raises:
            ValueError: If the closing quote is missing.
        """
        pieces: List[str] = []  # joined once at the end to avoid quadratic +=

        self._next_char()  # Skip the opening quote

        while self._pos < len(self._content) and self._current_char() != '"':
            if self._current_char() == "\\":
                self._next_char()  # Skip the backslash
                pieces.append(self._parse_escape())
            else:
                pieces.append(self._next_char())

        if self._current_char() != '"':
            raise self._error("Unterminated string")

        self._next_char()  # Skip the closing quote
        return "".join(pieces)

    def _parse_escape(self) -> str:
        """
        Decode one escape sequence; the backslash must already be consumed.

        Handles the single-character escapes, ``\\xNN`` and ``\\u{N...}``.
        Unknown or malformed escapes are kept verbatim (backslash included).
        """
        marker = self._current_char()

        if marker in self._SIMPLE_ESCAPES:
            self._next_char()
            return self._SIMPLE_ESCAPES[marker]

        if marker == "x":
            digits = self._content[self._pos + 1 : self._pos + 3]
            if len(digits) == 2 and all(d in self._HEX_DIGITS for d in digits):
                self._advance(3)  # x + two hex digits
                # NOTE: mapped to the code point with that value, since the
                # result is a Python str rather than bytes.
                return chr(int(digits, 16))
        elif marker == "u" and self._peek() == "{":
            close = self._content.find("}", self._pos + 2)
            digits = self._content[self._pos + 2 : close] if close != -1 else ""
            if (
                digits
                and len(digits) <= 6
                and all(d in self._HEX_DIGITS for d in digits)
            ):
                code_point = int(digits, 16)
                if code_point <= 0x10FFFF:
                    self._advance(close + 1 - self._pos)  # u{...}
                    return chr(code_point)

        # Unknown or malformed escape: keep it as written.
        self._next_char()
        return "\\" + marker

    def _parse_number(self) -> Union[int, float]:
        """
        Parse an integer or float literal.

        Accepts an optional leading ``-``, ``0x`` / ``0o`` / ``0b`` prefixed
        integers, ``_`` digit separators, a fractional part and an exponent.

        Raises:
            ValueError: If the consumed text is not a valid number.
        """
        start = self._pos

        negative = self._current_char() == "-"
        if negative:
            self._next_char()

        # Prefixed (hex / octal / binary) integers
        radix_spec = None
        if self._current_char() == "0":
            radix_spec = self._RADIX_DIGITS.get(self._peek().lower())
        if radix_spec is not None:
            base, allowed = radix_spec
            self._advance(2)  # Skip "0" and the radix letter
            digits_start = self._pos
            self._consume_while(allowed)
            digits = self._content[digits_start : self._pos].replace("_", "")
            if not digits:
                raise self._error("Expected digits after radix prefix")
            value = int(digits, base)
            return -value if negative else value

        # Decimal integer or float
        is_float = False
        self._consume_while(self._DECIMAL_DIGITS)

        if self._current_char() == ".":
            is_float = True
            self._next_char()
            self._consume_while(self._DECIMAL_DIGITS)

        if self._current_char() in ("e", "E"):
            is_float = True
            self._next_char()
            # Tuple membership: "" (end of input) must not count as a sign.
            if self._current_char() in ("+", "-"):
                self._next_char()
            self._consume_while(self._DECIMAL_DIGITS)

        num_str = self._content[start : self._pos].replace("_", "")

        try:
            return float(num_str) if is_float else int(num_str)
        except ValueError:
            raise self._error(f"Invalid number '{num_str}'") from None

    def _parse_boolean(self) -> bool:
        """
        Parse ``true`` or ``false``.

        Raises:
            ValueError: If neither keyword starts at the cursor.
        """
        if self._content.startswith("true", self._pos):
            self._advance(4)
            return True
        if self._content.startswith("false", self._pos):
            self._advance(5)
            return False
        raise self._error("Expected 'true' or 'false'")


def parse_zon_file(file_path: str, empty_tuple_as_dict: bool = False) -> Dict[str, Any]:
    """
    Parse a ZON file and return a Python dictionary.

    Args:
        file_path: Path to the ZON file
        empty_tuple_as_dict: If True, empty tuples (.{}) will be parsed as empty dictionaries ({})
                           If False, empty tuples will be parsed as empty lists ([])

    Returns:
        Dictionary representation of the ZON file

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        ValueError: If the content is not valid ZON.
    """
    logger.debug(f"Parsing ZON file: {file_path}")
    # Decode explicitly as UTF-8 (the encoding of Zig source) instead of
    # relying on the platform's locale-dependent default.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    parser = ZonParser(content, empty_tuple_as_dict=empty_tuple_as_dict)
    result = parser.parse()
    logger.debug("Successfully parsed ZON file")
    return result


def zon_to_json(
    zon_content: str, indent: Optional[int] = None, empty_tuple_as_dict: bool = False
) -> str:
    """
    Serialize ZON text as a JSON string.

    Args:
        zon_content: The ZON document, as text
        indent: Spaces per indentation level; None produces compact JSON
        empty_tuple_as_dict: If True, an empty ``.{}`` becomes ``{}`` in the output;
                           if False, it becomes ``[]``

    Returns:
        The JSON encoding of the parsed ZON value
    """
    parsed = ZonParser(
        zon_content,
        empty_tuple_as_dict=empty_tuple_as_dict,
    ).parse()
    return json.dumps(parsed, indent=indent)