"""XLSX reader — pure Python, zero dependencies. Uses ``zipfile`` + ``xml.etree.ElementTree`` to parse an .xlsx file and return a cell map matching the xls_reader interface. """ import zipfile import xml.etree.ElementTree as ET from models import FileFormatError from utils import cell_ref_to_rc # OOXML namespace — the XML uses a default namespace (no prefix), # so we build the tag names with the full URI. _S = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' def _tag(local: str) -> str: """Build a namespaced tag like {ns}row.""" return f'{{{_S}}}{local}' def read_excel_cells(filepath: str) -> dict[tuple[int, int], str]: """Read an .xlsx file and return {(row, col): str}. Rows and columns are 0-based. A1 → (0, 0). """ return XLSXReader(filepath).read_all_cells() class XLSXReader: """Read an .xlsx file and return a cell map.""" def __init__(self, filepath: str): self._filepath = filepath self._shared_strings: list[str] = [] self._cells: dict[tuple[int, int], str] = {} def read_all_cells(self) -> dict[tuple[int, int], str]: """Return {(row, col): str} for every non-empty cell.""" with zipfile.ZipFile(self._filepath, 'r') as zf: self._parse_shared_strings(zf) self._parse_sheet(zf, 'xl/worksheets/sheet1.xml') return dict(self._cells) def _parse_shared_strings(self, zf: zipfile.ZipFile): """Parse xl/sharedStrings.xml.""" try: data = zf.read('xl/sharedStrings.xml') except KeyError: return # No shared strings table root = ET.fromstring(data) self._shared_strings = [] for si in root.findall(_tag('si')): text_parts = [] for t in si.findall(f'.//{_tag("t")}'): if t.text: text_parts.append(t.text) self._shared_strings.append(''.join(text_parts)) def _parse_sheet(self, zf: zipfile.ZipFile, sheet_path: str): """Parse a worksheet XML and populate self._cells.""" try: data = zf.read(sheet_path) except KeyError: raise FileFormatError(f"Worksheet not found: {sheet_path}") root = ET.fromstring(data) sheet_data = root.find(_tag('sheetData')) if sheet_data is None: return for row_elem in sheet_data.findall(_tag('row')): row_num = int(row_elem.get('r', '0')) - 1 # 1-based → 0-based for cell_elem in row_elem.findall(_tag('c')): ref = cell_elem.get('r', '') if not ref: continue row, col = cell_ref_to_rc(ref) cell_type = cell_elem.get('t', '') value_elem = cell_elem.find(_tag('v')) value = value_elem.text if value_elem is not None else '' if cell_type == 's': # Shared string reference try: idx = int(value) value = self._shared_strings[idx] if idx < len(self._shared_strings) else value except (ValueError, IndexError): pass elif cell_type == 'b': value = 'TRUE' if value == '1' else 'FALSE' elif cell_type == 'n': # Numeric — keep as-is (will be formatted later) pass # else: inline string or default text self._cells[(row, col)] = value