Large scale refactor to improve db efficiency

2023-09-27 14:16:28 -05:00
parent 82ab06efad
commit e484eabb22
37 changed files with 1782 additions and 1697 deletions
--- a/src/submissions/backend/excel/parser.py
+++ b/src/submissions/backend/excel/parser.py
@@ -6,15 +6,14 @@ import pprint
 from typing import List
 import pandas as pd
 from pathlib import Path
-from backend.db import lookup_sample_by_submitter_id, get_reagents_in_extkit, lookup_kittype_by_name, lookup_submissiontype_by_name, models
+from backend.db import models, lookup_kit_types, lookup_submission_type, lookup_samples
 from backend.pydant import PydSubmission, PydReagent
 import logging
 from collections import OrderedDict
 import re
-import numpy as np
 from datetime import date
 from dateutil.parser import parse, ParserError
-from tools import check_not_nan, RSLNamer, convert_nans_to_nones, Settings, convert_well_to_row_column
+from tools import check_not_nan, RSLNamer, convert_nans_to_nones, Settings
 from frontend.custom_widgets.pop_ups import SubmissionTypeSelector, KitSelector

 logger = logging.getLogger(f"submissions.{__name__}")
@@ -106,11 +105,14 @@ class SheetParser(object):
                    self.sub[k] = v
        logger.debug(f"Parser.sub after info scrape: {pprint.pformat(self.sub)}")

-    def parse_reagents(self):
+    def parse_reagents(self, extraction_kit:str|None=None):
        """
        Pulls reagent info from the excel sheet
        """        
-        self.sub['reagents'] = ReagentParser(ctx=self.ctx, xl=self.xl, submission_type=self.sub['submission_type'], extraction_kit=self.sub['extraction_kit']).parse_reagents()
+        if extraction_kit == None:
+            extraction_kit = extraction_kit=self.sub['extraction_kit']
+        logger.debug(f"Parsing reagents for {extraction_kit}")
+        self.sub['reagents'] = ReagentParser(ctx=self.ctx, xl=self.xl, submission_type=self.sub['submission_type'], extraction_kit=extraction_kit).parse_reagents()

    def parse_samples(self):
        """
@@ -180,7 +182,8 @@ class SheetParser(object):
        """
        Enforce that only allowed reagents get into the Pydantic Model
        """          
-        allowed_reagents = [item.name for item in get_reagents_in_extkit(ctx=self.ctx, kit_name=self.sub['extraction_kit']['value'])]
+        kit = lookup_kit_types(ctx=self.ctx, name=self.sub['extraction_kit']['value'])
+        allowed_reagents = [item.name for item in kit.get_reagents()]
        logger.debug(f"List of reagents for comparison with allowed_reagents: {pprint.pformat(self.sub['reagents'])}")
        self.sub['reagents'] = [reagent for reagent in self.sub['reagents'] if reagent['value'].type in allowed_reagents]
        
@@ -217,7 +220,8 @@ class InfoParser(object):
        if isinstance(submission_type, str):
            submission_type = dict(value=submission_type, parsed=False)
        logger.debug(f"Looking up submission type: {submission_type['value']}")
-        submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type['value'])
+        # submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type['value'])
+        submission_type = lookup_submission_type(ctx=self.ctx, name=submission_type['value'])
        info_map = submission_type.info_map
        return info_map

@@ -269,7 +273,9 @@ class ReagentParser(object):
        self.xl = xl

    def fetch_kit_info_map(self, extraction_kit:dict, submission_type:str):
-        kit = lookup_kittype_by_name(ctx=self.ctx, name=extraction_kit['value'])
+        if isinstance(extraction_kit, dict):
+            extraction_kit = extraction_kit['value']
+        kit = lookup_kit_types(ctx=self.ctx, name=extraction_kit)
        if isinstance(submission_type, dict):
            submission_type = submission_type['value']
        reagent_map = kit.construct_xl_map_for_use(submission_type.title())
@@ -300,9 +306,9 @@ class ReagentParser(object):
                logger.debug(f"Got lot for {item}-{name}: {lot} as {type(lot)}")
                lot = str(lot)
                listo.append(dict(value=PydReagent(type=item.strip(), lot=lot, exp=expiry, name=name), parsed=parsed))
+        logger.debug(f"Returning listo: {listo}")
        return listo

-
 class SampleParser(object):
    """
    object to pull data for samples in excel sheet and construct individual sample objects
@@ -331,23 +337,48 @@ class SampleParser(object):
        if isinstance(self.lookup_table, pd.DataFrame):
            self.parse_lookup_table()
        
-    def fetch_sample_info_map(self, submission_type:dict) -> dict:
+    def fetch_sample_info_map(self, submission_type:str) -> dict:
+        """
+        Gets info locations in excel book for submission type.
+
+        Args:
+            submission_type (str): submission type
+
+        Returns:
+            dict: Info locations.
+        """        
        logger.debug(f"Looking up submission type: {submission_type}")
-        submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type)
+        submission_type = lookup_submission_type(ctx=self.ctx, name=submission_type)
        logger.debug(f"info_map: {pprint.pformat(submission_type.info_map)}")
        sample_info_map = submission_type.info_map['samples']
        return sample_info_map

    def construct_plate_map(self, plate_map_location:dict) -> pd.DataFrame:
+        """
+        Gets location of samples from plate map grid in excel sheet.
+
+        Args:
+            plate_map_location (dict): sheet name, start/end row/column
+
+        Returns:
+            pd.DataFrame: Plate map grid
+        """        
        df = self.xl.parse(plate_map_location['sheet'], header=None, dtype=object)
        df = df.iloc[plate_map_location['start_row']-1:plate_map_location['end_row'], plate_map_location['start_column']-1:plate_map_location['end_column']]
-        # logger.debug(f"Input dataframe for plate map: {df}")
        df = pd.DataFrame(df.values[1:], columns=df.iloc[0])
        df = df.set_index(df.columns[0])
-        # logger.debug(f"Output dataframe for plate map: {df}")
        return df
    
-    def construct_lookup_table(self, lookup_table_location) -> pd.DataFrame:
+    def construct_lookup_table(self, lookup_table_location:dict) -> pd.DataFrame:
+        """
+        Gets table of misc information from excel book
+
+        Args:
+            lookup_table_location (dict): sheet name, start/end row
+
+        Returns:
+            pd.DataFrame: Lookup table whose first sliced row supplies the column headers.
+        """        
        try:
            df = self.xl.parse(lookup_table_location['sheet'], header=None, dtype=object)
        except KeyError:
@@ -355,16 +386,17 @@ class SampleParser(object):
        df = df.iloc[lookup_table_location['start_row']-1:lookup_table_location['end_row']]
        df = pd.DataFrame(df.values[1:], columns=df.iloc[0])
        df = df.reset_index(drop=True)
-        # logger.debug(f"Dataframe for lookup table: {df}")
        return df
    
    def create_basic_dictionaries_from_plate_map(self):
+        """
+        Parse sample location/name from plate map
+        """        
        invalids = [0, "0", "EMPTY"]
        new_df = self.plate_map.dropna(axis=1, how='all')
        columns = new_df.columns.tolist()
        for _, iii in new_df.iterrows():
            for c in columns:
-                # logger.debug(f"Checking sample {iii[c]}")
                if check_not_nan(iii[c]):
                    if iii[c] in invalids:
                        logger.debug(f"Invalid sample name: {iii[c]}, skipping.")
@@ -378,8 +410,10 @@ class SampleParser(object):
                    self.samples.append(dict(submitter_id=id, row=row_keys[iii._name], column=c))
    
    def parse_lookup_table(self):
+        """
+        Parse misc info from lookup table.
+        """        
        def determine_if_date(input_str) -> str|date:
-            # logger.debug(f"Looks like we have a str: {input_str}")
            regex = re.compile(r"^\d{4}-?\d{2}-?\d{2}")
            if bool(regex.search(input_str)):
                logger.warning(f"{input_str} is a date!")
@@ -407,11 +441,19 @@ class SampleParser(object):
                                sample[k] = v
            logger.debug(f"Output sample dict: {sample}")

-    def parse_samples(self, generate:bool=True) -> List[dict]:
+    def parse_samples(self, generate:bool=True) -> List[dict]|List[models.BasicSample]:
+        """
+        Parse merged platemap/lookup info into dicts/samples
+
+        Args:
+            generate (bool, optional): Indicates whether sample objects are to be generated from dicts. Defaults to True.
+
+        Returns:
+            List[dict]|List[models.BasicSample]: List of samples
+        """        
        result = None
        new_samples = []
        for ii, sample in enumerate(self.samples):
-            # logger.debug(f"\n\n{new_samples}\n\n")
            try:
                if sample['submitter_id'] in [check_sample['sample'].submitter_id for check_sample in new_samples]:
                    sample['submitter_id'] = f"{sample['submitter_id']}-{ii}"
@@ -432,7 +474,6 @@ class SampleParser(object):
                    translated_dict[k] = convert_nans_to_nones(v)
            translated_dict['sample_type'] = f"{self.submission_type} Sample"
            parser_query = f"parse_{translated_dict['sample_type'].replace(' ', '_').lower()}"
-            # logger.debug(f"New sample dictionary going into object creation:\n{translated_dict}")
            try:
                custom_parser = getattr(self, parser_query)
                translated_dict = custom_parser(translated_dict)
@@ -445,6 +486,15 @@ class SampleParser(object):
        return result, new_samples

    def generate_sample_object(self, input_dict) -> models.BasicSample:
+        """
+        Constructs sample object from dict
+
+        Args:
+            input_dict (dict): sample information
+
+        Returns:
+            models.BasicSample: Sample object
+        """        
        query = input_dict['sample_type'].replace(" ", "")
        try:
            database_obj = getattr(models, query)
@@ -452,13 +502,12 @@ class SampleParser(object):
            logger.error(f"Could not find the model {query}. Using generic.")
            database_obj = models.BasicSample
        logger.debug(f"Searching database for {input_dict['submitter_id']}...")
-        instance = lookup_sample_by_submitter_id(ctx=self.ctx, submitter_id=input_dict['submitter_id'])
+        instance = lookup_samples(ctx=self.ctx, submitter_id=input_dict['submitter_id'])
        if instance == None:
            logger.debug(f"Couldn't find sample {input_dict['submitter_id']}. Creating new sample.")
            instance = database_obj()
            for k,v in input_dict.items():
                try:
-                    # setattr(instance, k, v)
                    instance.set_attribute(k, v)
                except Exception as e:
                    logger.error(f"Failed to set {k} due to {type(e).__name__}: {e}")
@@ -511,12 +560,27 @@ class SampleParser(object):
        return input_dict
    
    def parse_first_strand_sample(self, input_dict:dict) -> dict:
+        """
+        Update sample dictionary with first strand specific information
+
+        Args:
+            input_dict (dict): Input sample dictionary
+
+        Returns:
+            dict: Updated sample dictionary
+        """        
        logger.debug("Called first strand sample parser")
        input_dict['well'] = re.search(r"\s\((.*)\)$", input_dict['submitter_id']).groups()[0]
        input_dict['submitter_id'] = re.sub(r"\s\(.*\)$", "", str(input_dict['submitter_id'])).strip()
        return input_dict
    
-    def grab_plates(self):
+    def grab_plates(self) -> List[str]:
+        """
+        Parse plate names from the excel sheets listed in self.plates
+
+        Returns:
+            List[str]: list of plate names.
+        """        
        plates = []
        for plate in self.plates:
            df = self.xl.parse(plate['sheet'], header=None)
@@ -526,8 +590,7 @@ class SampleParser(object):
                continue
            plates.append(output)
        return plates
-
-        
+    
 class PCRParser(object):
    """
    Object to pull data from Design and Analysis PCR export file.
@@ -574,7 +637,6 @@ class PCRParser(object):
            sheet_name (str): Name of sheet in excel workbook that holds info.
        """        
        df = self.xl.parse(sheet_name=sheet_name, dtype=object).fillna("")
-        # self.pcr['file'] = df.iloc[1][1]
        self.pcr['comment'] = df.iloc[0][1]
        self.pcr['operator'] = df.iloc[1][1]
        self.pcr['barcode'] = df.iloc[2][1]
@@ -615,7 +677,6 @@ class PCRParser(object):
        except ValueError:
            logger.error("Well call number doesn't match sample number")
        logger.debug(f"Well call df: {well_call_df}")
-        # iloc is [row][column]
        for ii, row in self.samples_df.iterrows():
            try:
                sample_obj = [sample for sample in self.samples if sample['sample'] == row[3]][0]    
@@ -623,14 +684,8 @@ class PCRParser(object):
                sample_obj = dict(
                    sample = row['Sample'],
                    plate_rsl = self.plate_num,
-                    # elution_well = row['Well Position']
                )
            logger.debug(f"Got sample obj: {sample_obj}") 
-            # logger.debug(f"row: {row}")
-            # rsl_num = row[3]
-            # # logger.debug(f"Looking up: {rsl_num}")
-            # ww_samp = lookup_ww_sample_by_rsl_sample_number(ctx=self.ctx, rsl_number=rsl_num)
-            # logger.debug(f"Got: {ww_samp}")
            if isinstance(row['Cq'], float):
                sample_obj[f"ct_{row['Target'].lower()}"] = row['Cq']
            else:
@@ -639,20 +694,6 @@ class PCRParser(object):
                sample_obj[f"{row['Target'].lower()}_status"] = row['Assessment']
            except KeyError:
                logger.error(f"No assessment for {sample_obj['sample']}")
-            # match row["Target"]:
-            #     case "N1":
-            #         if isinstance(row['Cq'], float):
-            #             sample_obj['ct_n1'] = row["Cq"]
-            #         else:
-            #             sample_obj['ct_n1'] = 0.0
-            #         sample_obj['n1_status'] = row['Assessment']
-            #     case "N2":
-            #         if isinstance(row['Cq'], float):
-            #             sample_obj['ct_n2'] = row['Assessment']
-            #         else:
-            #             sample_obj['ct_n2'] = 0.0
-            #     case _:
-            #         logger.warning(f"Unexpected input for row[4]: {row["Target"]}")
            self.samples.append(sample_obj)
        

--- a/src/submissions/backend/excel/reports.py
+++ b/src/submissions/backend/excel/reports.py
@@ -14,7 +14,7 @@ env = jinja_template_loading()

 logger = logging.getLogger(f"submissions.{__name__}")

-def make_report_xlsx(records:list[dict]) -> DataFrame:
+def make_report_xlsx(records:list[dict]) -> Tuple[DataFrame, DataFrame]:
    """
    create the dataframe for a report

@@ -92,7 +92,6 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
    """    
    
    df = DataFrame.from_records(input)
-    # df.to_excel("test.xlsx", engine="openpyxl")
    safe = ['name', 'submitted_date', 'genus', 'target']
    for column in df.columns:
        if "percent" in column:
@@ -102,7 +101,6 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
        if column not in safe:
            if subtype != None and column != subtype:
                del df[column]
-    # logger.debug(df)
    # move date of sample submitted on same date as previous ahead one.
    df = displace_date(df)
    # ad hoc method to make data labels more accurate.
@@ -215,14 +213,10 @@ def drop_reruns_from_df(ctx:dict, df: DataFrame) -> DataFrame:
    """    
    if 'rerun_regex' in ctx:
        sample_names = get_unique_values_in_df_column(df, column_name="name")
-        # logger.debug(f"Compiling regex from: {settings['rerun_regex']}")
        rerun_regex = re.compile(fr"{ctx['rerun_regex']}")
        for sample in sample_names:
-            # logger.debug(f'Running search on {sample}')
            if rerun_regex.search(sample):
-                # logger.debug(f'Match on {sample}')
                first_run = re.sub(rerun_regex, "", sample)
-                # logger.debug(f"First run: {first_run}")
                df = df.drop(df[df.name == first_run].index)
    return df