Updated parser functions to include identifiers.

This commit is contained in:
lwark
2025-06-11 13:18:01 -05:00
parent 592073c2a1
commit 90dc97683f
7 changed files with 254 additions and 25 deletions

View File

@@ -8,6 +8,7 @@ from openpyxl import load_workbook
from pandas import DataFrame
from backend.validators import pydant
from backend.db.models import Procedure
from dataclasses import dataclass
logger = logging.getLogger(f"submissions.{__name__}")
@@ -16,16 +17,34 @@ class DefaultParser(object):
def __repr__(self):
    """
    Build the debug representation of the parser.

    Returns:
        str: The class name followed by the stem of the parsed file in angle brackets.
    """
    class_name = self.__class__.__name__
    file_stem = self.filepath.stem
    return f"{class_name}<{file_stem}>"
def __new__(cls, *args, **kwargs):
    """
    Create the parser instance only if the target file exists.

    Args:
        *args (): If 'filepath' is not given as a kwarg, the first positional argument is used.
        **kwargs (): 'filepath' (Path|str) is read from here when present.

    Raises:
        TypeError: No filepath was supplied at all.
        FileNotFoundError: The given filepath does not exist.

    Returns:
        A new instance of cls with its 'filepath' attribute set to a Path.
    """
    # NOTE: Accept the filepath positionally as well, so a positional call does not die with a bare KeyError.
    if "filepath" in kwargs:
        filepath = kwargs["filepath"]
    elif args:
        filepath = args[0]
    else:
        raise TypeError(f"{cls.__name__} requires a 'filepath' argument.")
    filepath = Path(filepath)
    # NOTE: Explicit check rather than assert, which is removed when python runs with -O.
    if not filepath.exists():
        raise FileNotFoundError(f"File {filepath} does not exist.")
    instance = super().__new__(cls)
    instance.filepath = filepath
    return instance
def __init__(self, filepath: Path | str, procedure: Procedure|None=None, range_dict: dict | None = None, *args, **kwargs):
"""
Args:
filepath (Path|str): Must be given as a kwarg. eg. filepath=X
procedure ():
range_dict ():
*args ():
**kwargs ():
"""
self.procedure = procedure
try:
self._pyd_object = getattr(pydant, f"Pyd{self.__class__.__name__.replace('Parser', '')}")
except AttributeError:
self._pyd_object = pydant.PydResults
if isinstance(filepath, str):
self.filepath = Path(filepath)
else:
self.filepath = filepath
self.workbook = load_workbook(self.filepath, data_only=True)
if not range_dict:
self.range_dict = self.__class__.default_range_dict

View File

@@ -2,15 +2,69 @@
"""
import logging
from pathlib import Path
from string import ascii_lowercase
from typing import Generator
from openpyxl.reader.excel import load_workbook
from tools import row_keys
from backend.db.models import SubmissionType
from . import DefaultKEYVALUEParser, DefaultTABLEParser
logger = logging.getLogger(f"submissions.{__name__}")
class ClientSubmissionParser(DefaultKEYVALUEParser):
class SubmissionTyperMixin(object):
@classmethod
def retrieve_submissiontype(cls, filepath: Path):
    """
    Determine the submission type of a file, trying progressively less reliable sources.

    Order of attempts: workbook properties, then a preparse of the file, then the filename regex.
    The first truthy result is returned.

    Args:
        filepath (Path): File to inspect.

    Returns:
        Whatever the first successful get_subtype_* helper returned; the regex result (possibly None) if the others fail.
    """
    # NOTE: Attempt 1, get from form properties:
    found = cls.get_subtype_from_properties(filepath=filepath)
    if found:
        return found
    # NOTE: Attempt 2, get by opening file and using default parser
    logger.warning(
        f"Getting submissiontype from file properties failed, falling back on preparse.\nDepending on excel structure this might yield an incorrect submissiontype")
    found = cls.get_subtype_from_preparse(filepath=filepath)
    if found:
        return found
    # NOTE: Attempt 3, last resort is the filename regex
    logger.warning(
        f"Getting submissiontype from preparse failed, falling back on filename regex.\nDepending on excel structure this might yield an incorrect submissiontype")
    return cls.get_subtype_from_regex(filepath=filepath)
@classmethod
def get_subtype_from_regex(cls, filepath: Path):
    """
    Guess the submission type name by searching the file path with SubmissionType.regex.

    Args:
        filepath (Path): File whose string form is searched.

    Returns:
        str | None: Name of the last matched named group, or None when the regex does not match.
    """
    # NOTE(review): this returns a group name, while the other get_subtype_* helpers return the result of SubmissionType.query - confirm callers handle both.
    match = SubmissionType.regex.search(str(filepath))
    # NOTE: A failed search returns None; check for it directly instead of catching AttributeError.
    if match is None:
        logger.critical(f"No submission type found in filename: {filepath}")
        return None
    return match.lastgroup
@classmethod
def get_subtype_from_preparse(cls, filepath: Path):
    """
    Determine the submission type by parsing the file and looking for a 'submissiontype' entry.

    Args:
        filepath (Path): File to parse.

    Returns:
        The SubmissionType.query result for the parsed name, or None if the query returned a list.
    """
    # NOTE: filepath is passed by keyword, matching how the parser constructors in this module are called.
    # NOTE(review): ClientSubmissionParser.__init__ calls retrieve_submissiontype itself, so when the
    # properties lookup fails this re-enters the same fallback chain for the same file - confirm this cannot recurse indefinitely.
    parser = ClientSubmissionParser(filepath=filepath)
    sub_type = next((value for k, value in parser.parsed_info if k == "submissiontype"), None)
    sub_type = SubmissionType.query(name=sub_type)
    # NOTE: A list result is discarded rather than treated as a single submission type.
    if isinstance(sub_type, list):
        sub_type = None
    return sub_type
@classmethod
def get_subtype_from_properties(cls, filepath: Path):
    """
    Determine the submission type from the workbook's 'category' document property.

    Args:
        filepath (Path): Workbook to open.

    Returns:
        The SubmissionType.query result for the first category, or None when the workbook
        has no usable category or the query returned a list.
    """
    wb = load_workbook(filepath)
    # NOTE: category is None when the workbook carries no category metadata; treat that as empty
    # so the caller can fall back to its other strategies instead of crashing on None.split.
    raw_category = wb.properties.category or ""
    # NOTE: Gets first category in the metadata.
    name = raw_category.split(";")[0].strip().title()
    if not name:
        return None
    sub_type = SubmissionType.query(name=name)
    # NOTE: A list result is discarded rather than treated as a single submission type.
    if isinstance(sub_type, list):
        return None
    return sub_type
class ClientSubmissionParser(DefaultKEYVALUEParser, SubmissionTyperMixin):
"""
Object for retrieving submitter info from "sample list" sheet
"""
@@ -23,11 +77,16 @@ class ClientSubmissionParser(DefaultKEYVALUEParser):
sheet="Sample List"
)]
def __init__(self, filepath: Path | str, *args, **kwargs):
    """
    Resolve the submission type for the file, then initialise the parent parser.

    Args:
        filepath (Path | str): File to parse.
        *args (): Accepted but not forwarded to the parent.
        **kwargs (): Forwarded to the parent; 'range_dict' defaults to the submission type's info_map.
    """
    self.submissiontype = self.retrieve_submissiontype(filepath=filepath)
    # NOTE: Only look up info_map when the caller did not supply a range_dict of their own.
    caller_gave_ranges = "range_dict" in kwargs
    if not caller_gave_ranges:
        kwargs.update(range_dict=self.submissiontype.info_map)
    super().__init__(filepath=filepath, **kwargs)
class SampleParser(DefaultTABLEParser):
class ClientSampleParser(DefaultTABLEParser, SubmissionTyperMixin):
"""
Object for retrieving submitter info from "sample list" sheet
Object for retrieving submitter samples from "sample list" sheet
"""
default_range_dict = [dict(
@@ -36,6 +95,12 @@ class SampleParser(DefaultTABLEParser):
sheet="Sample List"
)]
def __init__(self, filepath: Path | str, *args, **kwargs):
    """
    Resolve the submission type for the file, then initialise the parent parser.

    Args:
        filepath (Path | str): File to parse.
        *args (): Accepted but not forwarded to the parent.
        **kwargs (): Forwarded to the parent; 'range_dict' defaults to the submission type's sample_map.
    """
    self.submissiontype = self.retrieve_submissiontype(filepath=filepath)
    # NOTE: Only look up sample_map when the caller did not supply a range_dict of their own.
    caller_gave_ranges = "range_dict" in kwargs
    if not caller_gave_ranges:
        kwargs.update(range_dict=self.submissiontype.sample_map)
    super().__init__(filepath=filepath, **kwargs)
@property
def parsed_info(self) -> Generator[dict, None, None]:
output = super().parsed_info