Updated parser functions to include identifiers.

2025-06-11 13:18:01 -05:00
parent 592073c2a1
commit 90dc97683f
7 changed files with 254 additions and 25 deletions
--- a/src/submissions/backend/excel/parsers/init.py
+++ b/src/submissions/backend/excel/parsers/init.py
@@ -8,6 +8,7 @@ from openpyxl import load_workbook
 from pandas import DataFrame
 from backend.validators import pydant
 from backend.db.models import Procedure
+from dataclasses import dataclass

 logger = logging.getLogger(f"submissions.{__name__}")

@@ -16,16 +17,34 @@ class DefaultParser(object):
    def __repr__(self):
        return f"{self.__class__.__name__}<{self.filepath.stem}>"

+    def __new__(cls, *args, **kwargs):
+        """
+        Create the instance and attach a validated filepath before __init__ runs.
+
+        Args:
+            *args (): Positional arguments; the first is used as filepath if the kwarg is absent.
+            **kwargs (): Keyword arguments; "filepath" (Path|str) is read from here when present.
+
+        Returns:
+            New instance with its filepath attribute set to a Path.
+
+        Raises:
+            TypeError: If no filepath was supplied.
+            FileNotFoundError: If filepath does not exist.
+        """
+        if "filepath" in kwargs:
+            filepath = kwargs["filepath"]
+        elif args:
+            # NOTE: filepath is the first positional parameter of __init__.
+            filepath = args[0]
+        else:
+            raise TypeError(f"{cls.__name__} missing required argument: 'filepath'")
+        filepath = Path(filepath)
+        # NOTE: Explicit check rather than assert, which is stripped when python runs with -O.
+        if not filepath.exists():
+            raise FileNotFoundError(f"File {filepath} does not exist.")
+        instance = super().__new__(cls)
+        instance.filepath = filepath
+        return instance
+
+
    def __init__(self, filepath: Path | str, procedure: Procedure|None=None, range_dict: dict | None = None, *args, **kwargs):
+        """
+
+        Args:
+            filepath (Path|str): Must be given as a kwarg. eg. filepath=X
+            procedure ():
+            range_dict ():
+            *args ():
+            **kwargs ():
+        """
        self.procedure = procedure
        try:
            self._pyd_object = getattr(pydant, f"Pyd{self.__class__.__name__.replace('Parser', '')}")
        except AttributeError:
            self._pyd_object = pydant.PydResults
-        if isinstance(filepath, str):
-            self.filepath = Path(filepath)
-        else:
-            self.filepath = filepath
        self.workbook = load_workbook(self.filepath, data_only=True)
        if not range_dict:
            self.range_dict = self.__class__.default_range_dict
--- a/src/submissions/backend/excel/parsers/submission_parser.py
+++ b/src/submissions/backend/excel/parsers/submission_parser.py
@@ -2,15 +2,69 @@

 """
 import logging
+from pathlib import Path
 from string import ascii_lowercase
 from typing import Generator
+
+from openpyxl.reader.excel import load_workbook
+
 from tools import row_keys
+from backend.db.models import SubmissionType
 from . import DefaultKEYVALUEParser, DefaultTABLEParser

 logger = logging.getLogger(f"submissions.{__name__}")


-class ClientSubmissionParser(DefaultKEYVALUEParser):
+class SubmissionTyperMixin(object):
+    """
+    Mixin that works out which SubmissionType an excel file belongs to.
+
+    Three strategies are tried in order by retrieve_submissiontype: the workbook's
+    "category" document property, a preparse of the sheet contents, and finally
+    a regex search over the file path.
+    """
+
+    # NOTE: Files with a preparse underway. ClientSubmissionParser.__init__ calls
+    # retrieve_submissiontype itself, so without this guard the preparse fallback
+    # would recurse without end whenever the properties lookup fails.
+    _preparse_in_progress: set = set()
+
+    @classmethod
+    def retrieve_submissiontype(cls, filepath: Path | str):
+        """
+        Determine the submission type of a file, trying progressively less reliable sources.
+
+        Args:
+            filepath (Path|str): Location of the excel file.
+
+        Returns:
+            Result of the first strategy that yields a truthy value: a SubmissionType.query
+            result from the properties or preparse lookups, or the group name produced by
+            the regex fallback. None if every strategy fails.
+        """
+        # NOTE: Attempt 1, get from form properties:
+        sub_type = cls.get_subtype_from_properties(filepath=filepath)
+        if not sub_type:
+            # NOTE: Attempt 2, get by opening file and using default parser
+            logger.warning(
+                "Getting submissiontype from file properties failed, falling back on preparse.\nDepending on excel structure this might yield an incorrect submissiontype")
+            sub_type = cls.get_subtype_from_preparse(filepath=filepath)
+        if not sub_type:
+            # NOTE: Attempt 3, get from the file path.
+            logger.warning(
+                "Getting submissiontype from preparse failed, falling back on filename regex.\nDepending on excel structure this might yield an incorrect submissiontype")
+            sub_type = cls.get_subtype_from_regex(filepath=filepath)
+        return sub_type
+
+    @classmethod
+    def get_subtype_from_regex(cls, filepath: Path | str):
+        """
+        Search the file path string with SubmissionType.regex.
+
+        Args:
+            filepath (Path|str): Location of the excel file.
+
+        Returns:
+            The match's lastgroup value, or None if the regex does not match.
+        """
+        # NOTE(review): assumes SubmissionType.regex is a compiled re pattern - confirm.
+        m = SubmissionType.regex.search(str(filepath))
+        if m is None:
+            logger.critical(f"No submission type found in file name: {filepath}")
+            return None
+        return m.lastgroup
+
+    @classmethod
+    def get_subtype_from_preparse(cls, filepath: Path | str):
+        """
+        Parse the file with ClientSubmissionParser and look up its "submissiontype" value.
+
+        Args:
+            filepath (Path|str): Location of the excel file.
+
+        Returns:
+            Result of SubmissionType.query, or None if the query returned a list or a
+            preparse of this file is already running.
+        """
+        key = str(filepath)
+        if key in cls._preparse_in_progress:
+            # NOTE: Re-entered from the parser constructed below; give up so the caller moves on to the regex.
+            return None
+        cls._preparse_in_progress.add(key)
+        try:
+            # NOTE: filepath must go in as a kwarg (DefaultParser documents this requirement).
+            parser = ClientSubmissionParser(filepath=filepath)
+            sub_type = next((value for k, value in parser.parsed_info if k == "submissiontype"), None)
+        finally:
+            cls._preparse_in_progress.discard(key)
+        sub_type = SubmissionType.query(name=sub_type)
+        if isinstance(sub_type, list):
+            sub_type = None
+        return sub_type
+
+    @classmethod
+    def get_subtype_from_properties(cls, filepath: Path | str):
+        """
+        Look up the submission type named by the first non-empty entry of the workbook's "category" property.
+
+        Args:
+            filepath (Path|str): Location of the excel file.
+
+        Returns:
+            Result of SubmissionType.query, or None if no category is set or the query returned a list.
+        """
+        wb = load_workbook(filepath)
+        # NOTE: category is None when the property was never filled in.
+        category = wb.properties.category or ""
+        # NOTE: Gets first non-empty category in the metadata.
+        sub_type = next((item.strip().title() for item in category.split(";") if item.strip()), None)
+        if sub_type is None:
+            return None
+        sub_type = SubmissionType.query(name=sub_type)
+        if isinstance(sub_type, list):
+            sub_type = None
+        return sub_type
+
+
+class ClientSubmissionParser(DefaultKEYVALUEParser, SubmissionTyperMixin):
    """
    Object for retrieving submitter info from "sample list" sheet
    """
@@ -23,11 +77,16 @@ class ClientSubmissionParser(DefaultKEYVALUEParser):
        sheet="Sample List"
    )]

+    def __init__(self, filepath: Path | str, *args, **kwargs):
+        """
+        Resolve the submission type for the file, then initialise the parent parser.
+
+        Args:
+            filepath (Path|str): Location of the excel file to parse.
+            *args (): Accepted but not forwarded to the parent class.
+            **kwargs (): Forwarded to the parent class. A supplied "range_dict" takes precedence over the submission type's info_map.
+        """
+        self.submissiontype = self.retrieve_submissiontype(filepath=filepath)
+        if "range_dict" not in kwargs:
+            # NOTE: retrieve_submissiontype may come back as None, or as a group name from the
+            # filename regex fallback; neither has an info_map, so leave range_dict to the parent.
+            info_map = getattr(self.submissiontype, "info_map", None)
+            if info_map is not None:
+                kwargs['range_dict'] = info_map
+        super().__init__(filepath=filepath, **kwargs)


-class SampleParser(DefaultTABLEParser):
+class ClientSampleParser(DefaultTABLEParser, SubmissionTyperMixin):
    """
-    Object for retrieving submitter info from "sample list" sheet
+    Object for retrieving submitter samples from "sample list" sheet
    """

    default_range_dict = [dict(
@@ -36,6 +95,12 @@ class SampleParser(DefaultTABLEParser):
        sheet="Sample List"
    )]

+    def __init__(self, filepath: Path | str, *args, **kwargs):
+        """
+        Resolve the submission type for the file, then initialise the parent parser.
+
+        Args:
+            filepath (Path|str): Location of the excel file to parse.
+            *args (): Accepted but not forwarded to the parent class.
+            **kwargs (): Forwarded to the parent class. A supplied "range_dict" takes precedence over the submission type's sample_map.
+        """
+        self.submissiontype = self.retrieve_submissiontype(filepath=filepath)
+        if "range_dict" not in kwargs:
+            # NOTE: retrieve_submissiontype may come back as None, or as a group name from the
+            # filename regex fallback; neither has a sample_map, so leave range_dict to the parent.
+            sample_map = getattr(self.submissiontype, "sample_map", None)
+            if sample_map is not None:
+                kwargs['range_dict'] = sample_map
+        super().__init__(filepath=filepath, **kwargs)
+
    @property
    def parsed_info(self) -> Generator[dict, None, None]:
        output = super().parsed_info