Updated parser functions to include identifiers.

This commit is contained in:
lwark
2025-06-11 13:18:01 -05:00
parent 592073c2a1
commit 90dc97683f
7 changed files with 254 additions and 25 deletions

View File

@@ -13,6 +13,73 @@ from datetime import datetime
# NOTE: Module-level logger, named as a child of the "submissions" logger.
logger = logging.getLogger(f"submissions.{__name__}")
class DefaultNamer(object):
def __init__(self, filepath: str | Path, **kwargs):
if isinstance(filepath, str):
filepath = Path(filepath)
try:
assert filepath.exists()
except AssertionError:
raise FileNotFoundError(f"File {filepath} does not exist.")
self.filepath = filepath
class ClientSubmissionNamer(DefaultNamer):
    """
    Namer for client submission files that works out which SubmissionType the file belongs to.

    Attributes:
        filepath (Path): Path to the submission file (validated by DefaultNamer).
        submissiontype: The submission type given by the caller or retrieved from the file.
    """

    def __init__(self, filepath: str | Path, submissiontype: str|SubmissionType|None=None,
                 data: dict | None = None, **kwargs):
        """
        Args:
            filepath (str | Path): Location of the submission file.
            submissiontype (str | SubmissionType | None): Known submission type or its name.
                If falsy, it is retrieved from the file itself.
            data (dict | None): Not used by this constructor.
            **kwargs: Not used by this constructor.

        Raises:
            FileNotFoundError: If nothing exists at filepath.
        """
        super().__init__(filepath=filepath)
        if not submissiontype:
            submissiontype = self.retrieve_submissiontype(filepath=self.filepath)
        if isinstance(submissiontype, str):
            submissiontype = SubmissionType.query(name=submissiontype)
        # NOTE: Keep the resolved type on the instance so the lookup above is not thrown away.
        self.submissiontype = submissiontype

    def retrieve_submissiontype(self, filepath: str | Path):
        """
        Tries workbook properties, then a preparse of the file, then the filename regex.

        Args:
            filepath (str | Path): Not read here; every attempt works from self.filepath.

        Returns:
            Result of the first attempt that yields something truthy, or None if all fail.
        """
        # NOTE: Attempt 1, get from form properties:
        sub_type = self.get_subtype_from_properties()
        if not sub_type:
            # NOTE: Attempt 2, get by opening file and using default parser
            logger.warning(f"Getting submissiontype from file properties failed, falling back on preparse.\nDepending on excel structure this might yield an incorrect submissiontype")
            sub_type = self.get_subtype_from_preparse()
        if not sub_type:
            # NOTE: Attempt 3, get from the file name
            logger.warning(f"Getting submissiontype from preparse failed, falling back on filename regex.\nDepending on excel structure this might yield an incorrect submissiontype")
            sub_type = self.get_subtype_from_regex()
        return sub_type

    def get_subtype_from_regex(self):
        """
        Searches the file path with SubmissionType.regex.

        Returns:
            str | None: Name of the last matched group, or None if the regex did not match.
        """
        # NOTE(review): assumes SubmissionType.regex is a compiled pattern with named groups - confirm.
        regex = SubmissionType.regex
        m = regex.search(str(self.filepath))
        try:
            sub_type = m.lastgroup
        except AttributeError as e:
            # NOTE: No match object to read a group from.
            sub_type = None
            logger.critical(f"No procedure type found or procedure type found!: {e}")
        return sub_type

    def get_subtype_from_preparse(self):
        """
        Parses the file with ClientSubmissionParser and looks up the "submissiontype" it reports.

        Returns:
            Result of SubmissionType.query for the parsed name, or None if the query returned a list.
        """
        # NOTE(review): imported locally, presumably to avoid a circular import - confirm.
        from backend.excel.parsers.submission_parser import ClientSubmissionParser
        parser = ClientSubmissionParser(self.filepath)
        # NOTE(review): assumes parsed_info yields (key, value) pairs - confirm.
        sub_type = next((value for k, value in parser.parsed_info if k == "submissiontype"), None)
        sub_type = SubmissionType.query(name=sub_type)
        # NOTE: A list result is treated as "no single submission type found".
        if isinstance(sub_type, list):
            sub_type = None
        return sub_type

    def get_subtype_from_properties(self):
        """
        Reads the workbook's category property and looks up the first category listed.

        Returns:
            Result of SubmissionType.query for the first category, or None if the workbook has
            no category or the query returned a list.
        """
        wb = load_workbook(self.filepath)
        category = wb.properties.category
        # NOTE: category is None when the workbook has none set; return None so the caller
        # can fall back to its other attempts instead of failing on None.split().
        if not category:
            return None
        # NOTE: Gets first category in the metadata.
        sub_type = category.split(";")[0].strip().title()
        sub_type = SubmissionType.query(name=sub_type)
        # NOTE: A list result is treated as "no single submission type found".
        if isinstance(sub_type, list):
            sub_type = None
        return sub_type
class RSLNamer(object):
"""
@@ -25,16 +92,17 @@ class RSLNamer(object):
self.submission_type = submission_type
if not self.submission_type:
self.submission_type = self.retrieve_submission_type(filename=filename)
logger.info(f"got procedure type: {self.submission_type}")
logger.info(f"got submission type: {self.submission_type}")
if self.submission_type:
self.sub_object = BasicRun.find_polymorphic_subclass(polymorphic_identity=self.submission_type)
# self.sub_object = BasicRun.find_polymorphic_subclass(polymorphic_identity=self.submission_type)
self.sub_object = SubmissionType.query(name=submission_type, limit=1)
self.parsed_name = self.retrieve_rsl_number(filename=filename, regex=self.sub_object.get_regex(
submission_type=submission_type))
if not data:
data = dict(submission_type=self.submission_type)
if "proceduretype" not in data.keys():
data['proceduretype'] = self.submission_type
self.parsed_name = self.sub_object.enforce_name(instr=self.parsed_name, data=data)
submission_type=self.submission_type))
# if not data:
# data = dict(submission_type=self.submission_type)
# if "proceduretype" not in data.keys():
# data['proceduretype'] = self.submission_type
# self.parsed_name = self.sub_object.enforce_name(instr=self.parsed_name, data=data)
logger.info(f"Parsed name: {self.parsed_name}")
@classmethod
@@ -83,7 +151,7 @@ class RSLNamer(object):
def st_from_str(file_name: str) -> str:
if file_name.startswith("tmp"):
return "Bacterial Culture"
regex = BasicRun.regex
regex = SubmissionType.regex
m = regex.search(file_name)
try:
sub_type = m.lastgroup