diff --git a/src/submissions/backend/excel/parsers/__init__.py b/src/submissions/backend/excel/parsers/__init__.py new file mode 100644 index 0000000..c19e1b4 --- /dev/null +++ b/src/submissions/backend/excel/parsers/__init__.py @@ -0,0 +1,34 @@ +""" + +""" +from pathlib import Path +from openpyxl import load_workbook +from backend.validators import pydant + +class DefaultParser(object): + + + default_range_dict = dict( + start_row=2, + end_row=18, + key_column=1, + value_column=2, + sheet="Sample List" + ) + + def __repr__(self): + return f"{self.__class__.__name__}<{self.filepath.stem}>" + + def __init__(self, filepath: Path | str, range_dict: dict | None = None): + self._pyd_object = getattr(pydant, f"Pyd{self.__class__.__name__.replace('Parser', '')}") + if isinstance(filepath, str): + self.filepath = Path(filepath) + else: + self.filepath = filepath + self.workbook = load_workbook(self.filepath, data_only=True) + if not range_dict: + self.range_dict = self.__class__.default_range_dict + else: + self.range_dict = range_dict + +from .submission_parser import * \ No newline at end of file diff --git a/src/submissions/backend/excel/parsers/submission_parser.py b/src/submissions/backend/excel/parsers/submission_parser.py new file mode 100644 index 0000000..832a724 --- /dev/null +++ b/src/submissions/backend/excel/parsers/submission_parser.py @@ -0,0 +1,68 @@ +""" + +""" +import logging, re +from pathlib import Path +from typing import Generator, Tuple +from pandas import DataFrame + +from . 
logger = logging.getLogger(f"submissions.{__name__}")


class ClientSubmissionParser(DefaultParser):
    """
    Object for retrieving submitter info from "sample list" sheet

    Reads key/value cell pairs from the rows ``start_row``..``end_row``
    (inclusive) of the sheet named by ``range_dict['sheet']``.
    """

    def __init__(self, filepath: Path | str, range_dict: dict | None = None):
        super().__init__(filepath=filepath, range_dict=range_dict)
        self.worksheet = self.workbook[self.range_dict['sheet']]
        # ``end_row`` is inclusive, hence the + 1.
        self.rows = range(self.range_dict['start_row'], self.range_dict['end_row'] + 1)

    @property
    def parsed_info(self) -> Generator[Tuple, None, None]:
        """
        Yield ``(key, {"value": ..., "missing": ...})`` for each labelled row.

        Keys are normalised: parenthesised text and colons are removed, the
        text is lower-cased and stripped, and spaces become underscores.
        Rows whose key cell is empty are skipped, since they cannot be turned
        into a keyword argument in ``to_pydantic``.
        """
        key_column = self.range_dict['key_column']
        value_column = self.range_dict['value_column']
        for row in self.rows:
            raw_key = self.worksheet.cell(row, key_column).value
            if not raw_key:
                continue
            # str() guards against numeric or date cells in the key column.
            key = re.sub(r"\(.*\)", "", str(raw_key))
            key = key.lower().replace(":", "").strip().replace(" ", "_")
            value = self.worksheet.cell(row, value_column).value
            # Any falsy cell value (None, "", 0) is flagged as missing.
            yield key, dict(value=value, missing=not value)

    def to_pydantic(self):
        """
        Build ``self._pyd_object`` from the parsed key/value pairs plus ``filepath``.
        """
        data = dict(self.parsed_info)
        data['filepath'] = self.filepath
        return self._pyd_object(**data)


class SampleParser(DefaultParser):
    """
    Object for retrieving sample info from "sample list" sheet

    The row at ``header_row`` supplies the column names; every row below it
    is treated as one sample.
    """

    default_range_dict = dict(
        header_row=20,
        end_row=116,
        list_sheet="Sample List",
    )

    def __init__(self, filepath: Path | str, range_dict: dict | None = None):
        super().__init__(filepath=filepath, range_dict=range_dict)
        self.list_worksheet = self.workbook[self.range_dict['list_sheet']]
        # NOTE(review): ``end_row`` is declared in ``default_range_dict`` but is
        # not applied here; every row from ``header_row`` down is read.
        # Confirm whether the table should be cut off at ``end_row``.
        table = list(self.list_worksheet.values)[self.range_dict['header_row'] - 1:]
        self.list_df = DataFrame(table)
        # Without this guard an empty table raises IndexError on ``iloc[0]``.
        if not self.list_df.empty:
            self.list_df.columns = self.list_df.iloc[0]
            self.list_df = self.list_df[1:]
            self.list_df = self.list_df.dropna(axis=1, how='all')

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for ``None`` and NaN (the only value unequal to itself)."""
        return value is None or bool(value != value)

    @property
    def parsed_info(self) -> Generator[dict, None, None]:
        """
        Yield one dict per data row, keyed by normalised column header.

        Headers are stringified, stripped, lower-cased and have spaces replaced
        by underscores; columns without a header are skipped. Each dict also
        carries ``submission_rank``, the 1-based position of the row.
        """
        for rank, (_, row) in enumerate(self.list_df.iterrows(), start=1):
            sample = {
                str(header).strip().lower().replace(" ", "_"): value
                for header, value in row.to_dict().items()
                if not self._is_missing(header)
            }
            sample['submission_rank'] = rank
            yield sample

    def to_pydantic(self):
        """
        Return a list of ``self._pyd_object`` instances, one per row with a sample id.

        Rows whose ``sample_id`` is absent, falsy or NaN are skipped; NaN needs
        its own check because it is truthy.
        """
        samples = []
        for sample in self.parsed_info:
            sample_id = sample.get('sample_id')
            if self._is_missing(sample_id) or not sample_id:
                continue
            samples.append(self._pyd_object(**sample))
        return samples