pinmap-to-pinlist/Releases/v1.0.1/source/xlsx_reader.py

"""XLSX reader — pure Python, zero dependencies.

Uses ``zipfile`` + ``xml.etree.ElementTree`` to parse an .xlsx file
and return a cell map matching the xls_reader interface.
"""

import zipfile
import xml.etree.ElementTree as ET

from models import FileFormatError
from utils import cell_ref_to_rc

# OOXML namespace — the XML uses a default namespace (no prefix),
# so we build the tag names with the full URI.
_S = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'

def _tag(local: str) -> str:
    """Build a namespaced tag like {ns}row."""
    return f'{{{_S}}}{local}'


def read_excel_cells(filepath: str) -> dict[tuple[int, int], str]:
    """Read an .xlsx file and return {(row, col): str}.

    Rows and columns are 0-based.  A1 → (0, 0).
    """
    return XLSXReader(filepath).read_all_cells()


class XLSXReader:
    """Read an .xlsx file and return a cell map."""

    def __init__(self, filepath: str):
        self._filepath = filepath
        self._shared_strings: list[str] = []
        self._cells: dict[tuple[int, int], str] = {}

    def read_all_cells(self) -> dict[tuple[int, int], str]:
        """Return {(row, col): str} for every non-empty cell."""
        with zipfile.ZipFile(self._filepath, 'r') as zf:
            self._parse_shared_strings(zf)
            self._parse_sheet(zf, 'xl/worksheets/sheet1.xml')
        return dict(self._cells)

    def _parse_shared_strings(self, zf: zipfile.ZipFile):
        """Parse xl/sharedStrings.xml."""
        try:
            data = zf.read('xl/sharedStrings.xml')
        except KeyError:
            return  # No shared strings table

        root = ET.fromstring(data)
        self._shared_strings = []
        for si in root.findall(_tag('si')):
            text_parts = []
            for t in si.findall(f'.//{_tag("t")}'):
                if t.text:
                    text_parts.append(t.text)
            self._shared_strings.append(''.join(text_parts))

    def _parse_sheet(self, zf: zipfile.ZipFile, sheet_path: str):
        """Parse a worksheet XML and populate self._cells."""
        try:
            data = zf.read(sheet_path)
        except KeyError:
            raise FileFormatError(f"Worksheet not found: {sheet_path}")

        root = ET.fromstring(data)
        sheet_data = root.find(_tag('sheetData'))
        if sheet_data is None:
            return
        for row_elem in sheet_data.findall(_tag('row')):
            row_num = int(row_elem.get('r', '0')) - 1  # 1-based → 0-based
            for cell_elem in row_elem.findall(_tag('c')):
                ref = cell_elem.get('r', '')
                if not ref:
                    continue
                row, col = cell_ref_to_rc(ref)
                cell_type = cell_elem.get('t', '')
                value_elem = cell_elem.find(_tag('v'))
                value = value_elem.text if value_elem is not None else ''

                if cell_type == 's':
                    # Shared string reference
                    try:
                        idx = int(value)
                        value = self._shared_strings[idx] if idx < len(self._shared_strings) else value
                    except (ValueError, IndexError):
                        pass
                elif cell_type == 'b':
                    value = 'TRUE' if value == '1' else 'FALSE'
                elif cell_type == 'n':
                    # Numeric — keep as-is (will be formatted later)
                    pass
                # else: inline string or default text

                self._cells[(row, col)] = value