Before creating info and reagent parser classes.

Landon Wark
2023-08-21 13:50:38 -05:00
parent af810ae528
commit b6de159631
20 changed files with 1176 additions and 571 deletions


@@ -6,8 +6,8 @@ import pprint
from typing import Tuple
import pandas as pd
from pathlib import Path
from backend.db.models import WWSample, BCSample
from backend.db import lookup_ww_sample_by_ww_sample_num
from backend.db.models import WastewaterSample, BacterialCultureSample
from backend.db import lookup_ww_sample_by_ww_sample_num, lookup_sample_by_submitter_id, get_reagents_in_extkit, lookup_kittype_by_name, lookup_kittype_by_use
from backend.pydant import PydSubmission, PydReagent
import logging
from collections import OrderedDict
@@ -15,10 +15,14 @@ import re
import numpy as np
from datetime import date, datetime
import uuid
# from submissions.backend.db.functions import
from tools import check_not_nan, RSLNamer, massage_common_reagents, convert_nans_to_nones, Settings
from frontend.custom_widgets.pop_ups import SubmissionTypeSelector, KitSelector
logger = logging.getLogger(f"submissions.{__name__}")
row_keys = dict(A=1, B=2, C=3, D=4, E=5, F=6, G=7, H=8)
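The new module-level row_keys lookup is what the parsers below use to turn Excel well labels into numeric (row, column) pairs. A minimal standalone sketch of that conversion (the helper name is illustrative, not part of the module):

row_keys = dict(A=1, B=2, C=3, D=4, E=5, F=6, G=7, H=8)  # as defined above

def well_to_coords(well_label: str) -> tuple[int, int]:
    # "B07" -> (2, 7): the letter indexes the plate row, the digits give the column
    return row_keys[well_label[0].upper()], int(well_label[1:])

assert well_to_coords("A1") == (1, 1)
assert well_to_coords("H12") == (8, 12)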
class SheetParser(object):
"""
object to pull and contain data from excel file
@@ -46,9 +50,13 @@ class SheetParser(object):
# make decision about type of sample we have
self.sub['submission_type'] = self.type_decider()
# select proper parser based on sample type
parse_sub = getattr(self, f"parse_{self.sub['submission_type'].lower()}")
parse_sub = getattr(self, f"parse_{self.sub['submission_type'].replace(' ', '_').lower()}")
parse_sub()
# self.calculate_column_count()
self.import_kit_validation_check()
self.parse_reagents()
self.import_reagent_validation_check()
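The replace(' ', '_') added to the dispatch above is what lets multi-word submission types resolve to parser methods such as parse_wastewater_artic. A quick illustration, using a hypothetical type label:

submission_type = "Wastewater Artic"  # hypothetical label, for illustration only
method_name = f"parse_{submission_type.replace(' ', '_').lower()}"
# method_name == "parse_wastewater_artic"; without the replace it would be
# "parse_wastewater artic", which getattr(self, method_name) could not resolve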
def type_decider(self) -> str:
"""
@@ -74,7 +82,13 @@ class SheetParser(object):
return "Unknown"
except Exception as e:
logger.warning(f"We were unable to parse the submission type due to: {e}")
return "Unknown"
# return "Unknown"
dlg = SubmissionTypeSelector(ctx=self.ctx, title="Select Submission Type", message="We were unable to find the submission type from the excel metadata. Please select from below.")
if dlg.exec():
return dlg.getValues()
else:
logger.warning(f"Last attempt at getting submission was rejected.")
raise ValueError("Submission Type needed.")
def parse_unknown(self) -> None:
"""
@@ -173,9 +187,10 @@ class SheetParser(object):
self.sub['reagents'] = []
reagent_range = submission_info.iloc[1:14, 4:8]
logger.debug(reagent_range)
parse_reagents(reagent_range)
# parse_reagents(reagent_range)
# get individual sample info
sample_parser = SampleParser(self.ctx, submission_info.iloc[16:112])
logger.debug(f"Sample type: {self.sub['submission_type']}")
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type']['value'].replace(' ', '_').lower()}_samples")
logger.debug(f"Parser result: {self.sub}")
self.sample_result, self.sub['samples'] = sample_parse()
@@ -200,7 +215,7 @@ class SheetParser(object):
"""
# iterate through sub-df rows
for ii, row in df.iterrows():
logger.debug(f"Parsing this row for reagents: {row}")
# logger.debug(f"Parsing this row for reagents: {row}")
if check_not_nan(row[5]):
# must be prefixed with 'lot_' to be recognized by gui
# regex below will remove 80% from 80% ethanol in the Wastewater kit.
@@ -246,9 +261,9 @@ class SheetParser(object):
parsed = False
self.sub['technician'] = dict(value=f"Enr: {enrichment_info.columns[2]}, Ext: {extraction_info.columns[2]}, PCR: {qprc_info.columns[2]}", parsed=parsed)
self.sub['reagents'] = []
parse_reagents(enr_reagent_range)
parse_reagents(ext_reagent_range)
parse_reagents(pcr_reagent_range)
# parse_reagents(enr_reagent_range)
# parse_reagents(ext_reagent_range)
# parse_reagents(pcr_reagent_range)
# parse samples
sample_parser = SampleParser(self.ctx, submission_info.iloc[16:], elution_map=retrieve_elution_map())
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type']['value'].lower()}_samples")
@@ -263,7 +278,7 @@ class SheetParser(object):
def parse_reagents(df:pd.DataFrame):
logger.debug(df)
for ii, row in df.iterrows():
if check_not_nan(row[0]):
if check_not_nan(row[1]):
try:
output_key = re.sub(r"\(.+?\)", "", row[0].lower().strip().replace(' ', '_'))
except AttributeError:
@@ -294,7 +309,20 @@ class SheetParser(object):
self.sub['reagents'].append(PydReagent(type=output_key, lot=output_var, exp=expiry))
else:
continue
def massage_samples(df:pd.DataFrame) -> pd.DataFrame:
def massage_samples(df:pd.DataFrame, lookup_table:pd.DataFrame) -> pd.DataFrame:
"""
Takes sample info from the Artic sheet format and converts it to the regular format
Args:
df (pd.DataFrame): Elution plate map
lookup_table (pd.DataFrame): Sample submission form map.
Returns:
pd.DataFrame: Reshaped sample records for the SampleParser.
"""
lookup_table.set_index(lookup_table.columns[0], inplace=True)
lookup_table.columns = lookup_table.iloc[0]
logger.debug(f"Massaging samples from {lookup_table}")
df.set_index(df.columns[0], inplace=True)
df.columns = df.iloc[0]
logger.debug(f"df to massage\n: {df}")
@@ -305,10 +333,17 @@ class SheetParser(object):
continue
logger.debug(f"Checking {ii.name}{c}")
if check_not_nan(df.loc[ii.name, int(c)]) and df.loc[ii.name, int(c)] != "EMPTY":
sample_name = df.loc[ii.name, int(c)]
row = lookup_table.loc[lookup_table['Sample Name (WW)'] == sample_name]
logger.debug(f"Looking up {row['Sample Name (LIMS)'][-1]}")
try:
return_list.append(dict(sample_name=re.sub(r"\s?\(.*\)", "", df.loc[ii.name, int(c)]), \
well=f"{ii.name}{c}",
artic_plate=self.sub['rsl_plate_num']))
return_list.append(dict(submitter_id=re.sub(r"\s?\(.*\)", "", df.loc[ii.name, int(c)]), \
# well=f"{ii.name}{c}",
row = row_keys[ii.name],
column = c,
artic_plate=self.sub['rsl_plate_num'],
sample_name=row['Sample Name (LIMS)'][-1]
))
except TypeError as e:
logger.error(f"Got an int for {c}, skipping.")
continue
@@ -333,13 +368,69 @@ class SheetParser(object):
self.sub['extraction_kit'] = "ArticV4.1"
self.sub['technician'] = f"MM: {biomek_info.iloc[2][1]}, Bio: {biomek_info.iloc[3][1]}"
self.sub['reagents'] = []
parse_reagents(sub_reagent_range)
parse_reagents(biomek_reagent_range)
samples = massage_samples(biomek_info.iloc[22:31, 0:])
# parse_reagents(sub_reagent_range)
# parse_reagents(biomek_reagent_range)
samples = massage_samples(biomek_info.iloc[22:31, 0:], submission_info.iloc[4:37, 1:5])
# samples = massage_samples(biomek_info.iloc[25:33, 0:])
sample_parser = SampleParser(self.ctx, pd.DataFrame.from_records(samples))
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type']['value'].lower()}_samples")
self.sample_result, self.sub['samples'] = sample_parse()
def parse_reagents(self):
ext_kit = lookup_kittype_by_name(ctx=self.ctx, name=self.sub['extraction_kit'])
if ext_kit != None:
logger.debug(f"Querying extraction kit: {self.sub['submission_type']}")
reagent_map = ext_kit.construct_xl_map_for_use(use=self.sub['submission_type']['value'])
logger.debug(f"Reagent map: {pprint.pformat(reagent_map)}")
else:
raise AttributeError("No extraction kit found, unable to parse reagents")
for sheet in self.xl.sheet_names:
df = self.xl.parse(sheet)
relevant = {k:v for k,v in reagent_map.items() if sheet in reagent_map[k]['sheet']}
logger.debug(f"relevant map for {sheet}: {pprint.pformat(relevant)}")
if relevant == {}:
continue
for item in reagent_map:
try:
# role = item
name = df.iat[relevant[item]['name']['row']-2, relevant[item]['name']['column']-1]
lot = df.iat[relevant[item]['lot']['row']-2, relevant[item]['lot']['column']-1]
expiry = df.iat[relevant[item]['expiry']['row']-2, relevant[item]['expiry']['column']-1]
except (KeyError, IndexError):
continue
# self.sub['reagents'].append(dict(name=name, lot=lot, expiry=expiry, role=role))
self.sub['reagents'].append(PydReagent(type=item, lot=lot, exp=expiry, name=name))
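To help read the new parse_reagents, the map returned by construct_xl_map_for_use appears to be keyed by reagent role, with 1-indexed Excel coordinates per field and a list of sheets each role lives on; the shape and values below are hypothetical, inferred only from the lookups above:

# hypothetical entry, not real kit data
reagent_map = {
    "lysis_buffer": {
        "sheet": ["Extraction Info"],
        "name":   {"row": 6, "column": 5},
        "lot":    {"row": 6, "column": 6},
        "expiry": {"row": 6, "column": 7},
    },
}

# Excel row 6 / column 5 -> df.iat[4, 4]: pandas consumes Excel row 1 as the
# header (hence row - 2) and iat positions are 0-indexed (hence column - 1)
excel_row, excel_col = 6, 5
iat_row, iat_col = excel_row - 2, excel_col - 1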
def import_kit_validation_check(self):
"""
Enforce that the parser has an extraction kit, prompting the user with a KitSelector if none was parsed from the sheet.
"""
if not check_not_nan(self.sub['extraction_kit']):
dlg = KitSelector(ctx=self.ctx, title="Kit Needed", message="At minimum a kit is needed. Please select one.")
if dlg.exec():
self.sub['extraction_kit'] = dict(value=dlg.getValues(), parsed=False)
else:
raise ValueError("Extraction kit needed.")
else:
self.sub['extraction_kit'] = dict(value=self.sub['extraction_kit'], parsed=False)
# logger.debug(f"Here is the validated parser dictionary:\n\n{pprint.pformat(self.sub)}\n\n")
# return parser_sub
def import_reagent_validation_check(self):
"""
Enforce that only allowed reagents get into the Pydantic Model
"""
allowed_reagents = [item.name for item in get_reagents_in_extkit(ctx=self.ctx, kit_name=self.sub['extraction_kit']['value'])]
self.sub['reagents'] = [reagent for reagent in self.sub['reagents'] if reagent.type in allowed_reagents]
def to_pydantic(self) -> PydSubmission:
"""
@@ -352,8 +443,9 @@ class SheetParser(object):
psm = PydSubmission(ctx=self.ctx, filepath=self.filepath, **self.sub)
delattr(psm, "filepath")
return psm
class SampleParser(object):
"""
object to pull data for samples in excel sheet and construct individual sample objects
@@ -373,34 +465,36 @@ class SampleParser(object):
self.elution_map = elution_map
def parse_bacterial_culture_samples(self) -> Tuple[str|None, list[BCSample]]:
def parse_bacterial_culture_samples(self) -> Tuple[str|None, list[dict]]:
"""
construct bacterial culture specific sample objects
Returns:
list[dict]: list of dicts pairing each sample with its plate row and column
"""
# logger.debug(f"Samples: {self.samples}")
# logger.debug(f"Samples: {self.samples}")
new_list = []
for sample in self.samples:
new = BCSample()
new.well_number = sample['This section to be filled in completely by submittor']
new.sample_id = sample['Unnamed: 1']
new.organism = sample['Unnamed: 2']
new.concentration = sample['Unnamed: 3']
logger.debug(f"Well info: {sample['This section to be filled in completely by submittor']}")
instance = lookup_sample_by_submitter_id(ctx=self.ctx, submitter_id=sample['Unnamed: 1'])
if instance == None:
instance = BacterialCultureSample()
well_number = sample['This section to be filled in completely by submittor']
row = row_keys[well_number[0]]
column = int(well_number[1:])
instance.submitter_id = sample['Unnamed: 1']
instance.organism = sample['Unnamed: 2']
instance.concentration = sample['Unnamed: 3']
# logger.debug(f"Sample object: {new.sample_id} = {type(new.sample_id)}")
logger.debug(f"Got sample_id: {new.sample_id}")
logger.debug(f"Got sample_id: {instance.submitter_id}")
# need to exclude empties and blanks
try:
not_a_nan = not np.isnan(new.sample_id) and str(new.sample_id).lower() != 'blank'
except TypeError:
not_a_nan = True
if not_a_nan:
new_list.append(new)
if check_not_nan(instance.submitter_id):
new_list.append(dict(sample=instance, row=row, column=column))
return None, new_list
def parse_wastewater_samples(self) -> Tuple[str|None, list[WWSample]]:
def parse_wastewater_samples(self) -> Tuple[str|None, list[dict]]:
"""
construct wastewater specific sample objects
@@ -408,60 +502,95 @@ class SampleParser(object):
list[dict]: list of dicts pairing each sample with its elution plate row and column
"""
def search_df_for_sample(sample_rsl:str):
logger.debug(f"Attempting to find sample {sample_rsl} in \n {self.elution_map}")
# logger.debug(f"Attempting to find sample {sample_rsl} in \n {self.elution_map}")
well = self.elution_map.where(self.elution_map==sample_rsl)
# logger.debug(f"Well: {well}")
well = well.dropna(how='all').dropna(axis=1, how="all")
if well.size > 1:
well = well.iloc[0].to_frame().dropna().T
logger.debug(f"well {sample_rsl} post processing: {well.size}: {type(well)}, {well.index[0]}, {well.columns[0]}")
self.elution_map.at[well.index[0], well.columns[0]] = np.nan
logger.debug(f"well {sample_rsl} post processing: {well.size}: {type(well)}")#, {well.index[0]}, {well.columns[0]}")
try:
col = str(int(well.columns[0])).zfill(2)
except ValueError:
col = str(well.columns[0]).zfill(2)
self.elution_map.at[well.index[0], well.columns[0]] = np.nan
except IndexError as e:
logger.error(f"Couldn't find the well for {sample_rsl}")
return 0, 0
try:
column = int(well.columns[0])
except TypeError as e:
logger.error(f"Problem parsing out column number for {well}:\n {e}")
return f"{well.index[0]}{col}"
row = row_keys[well.index[0]]
return row, column
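The where/dropna idiom in search_df_for_sample isolates the single cell of the elution map that holds a given sample number. A self-contained sketch of the same idiom on toy data:

import numpy as np
import pandas as pd

elution = pd.DataFrame(
    [["RSL-1", np.nan], [np.nan, "RSL-2"]],
    index=["A", "B"], columns=[1, 2],
)
hit = elution.where(elution == "RSL-2")              # non-matching cells become NaN
hit = hit.dropna(how="all").dropna(axis=1, how="all")
print(hit.index[0], hit.columns[0])                  # -> B 2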
new_list = []
return_val = None
for sample in self.samples:
new = WWSample()
if check_not_nan(sample["Unnamed: 7"]):
new.rsl_number = sample['Unnamed: 7'] # previously Unnamed: 9
logger.debug(f"Sample: {sample}")
instance = lookup_ww_sample_by_ww_sample_num(ctx=self.ctx, sample_number=sample['Unnamed: 3'])
if instance == None:
instance = WastewaterSample()
if check_not_nan(sample["Unnamed: 7"]):
if sample["Unnamed: 7"] != "Fixed" and sample['Unnamed: 7'] != "Flex":
instance.rsl_number = sample['Unnamed: 7'] # previously Unnamed: 9
elif check_not_nan(sample['Unnamed: 9']):
instance.rsl_number = sample['Unnamed: 9'] # previously Unnamed: 9
else:
logger.error(f"No RSL sample number found for this sample.")
continue
else:
logger.error(f"No RSL sample number found for this sample.")
continue
instance.ww_processing_num = sample['Unnamed: 2']
# need to ensure we have a sample id for database integrity
# if we don't have a sample full id, make one up
if check_not_nan(sample['Unnamed: 3']):
logger.debug(f"Sample name: {sample['Unnamed: 3']}")
instance.submitter_id = sample['Unnamed: 3']
else:
instance.submitter_id = uuid.uuid4().hex.upper()
# logger.debug(f"The Submitter sample id is: {instance.submitter_id}")
# need to ensure we get a collection date
if check_not_nan(sample['Unnamed: 5']):
instance.collection_date = sample['Unnamed: 5']
else:
instance.collection_date = date.today()
# new.testing_type = sample['Unnamed: 6']
# new.site_status = sample['Unnamed: 7']
instance.notes = str(sample['Unnamed: 6']) # previously Unnamed: 8
instance.well_24 = sample['Unnamed: 1']
else:
logger.error(f"No RSL sample number found for this sample.")
continue
new.ww_processing_num = sample['Unnamed: 2']
# need to ensure we have a sample id for database integrity
# if we don't have a sample full id, make one up
if check_not_nan(sample['Unnamed: 3']):
new.ww_sample_full_id = sample['Unnamed: 3']
else:
new.ww_sample_full_id = uuid.uuid4().hex.upper()
# need to ensure we get a collection date
if check_not_nan(sample['Unnamed: 5']):
new.collection_date = sample['Unnamed: 5']
else:
new.collection_date = date.today()
# new.testing_type = sample['Unnamed: 6']
# new.site_status = sample['Unnamed: 7']
new.notes = str(sample['Unnamed: 6']) # previously Unnamed: 8
new.well_24 = sample['Unnamed: 1']
elu_well = search_df_for_sample(new.rsl_number)
if elu_well != None:
row = elu_well[0]
col = elu_well[1:].zfill(2)
new.well_number = f"{row}{col}"
else:
# try:
return_val += f"{new.rsl_number}\n"
# except TypeError:
# return_val = f"{new.rsl_number}\n"
new_list.append(new)
# What to do if the sample already exists
assert isinstance(instance, WastewaterSample)
if instance.rsl_number == None:
if check_not_nan(sample["Unnamed: 7"]):
if sample["Unnamed: 7"] != "Fixed" and sample['Unnamed: 7'] != "Flex":
instance.rsl_number = sample['Unnamed: 7'] # previously Unnamed: 9
elif check_not_nan(sample['Unnamed: 9']):
instance.rsl_number = sample['Unnamed: 9'] # previously Unnamed: 9
else:
logger.error(f"No RSL sample number found for this sample.")
if instance.collection_date == None:
if check_not_nan(sample['Unnamed: 5']):
instance.collection_date = sample['Unnamed: 5']
else:
instance.collection_date = date.today()
if instance.notes == None:
instance.notes = str(sample['Unnamed: 6']) # previously Unnamed: 8
if instance.well_24 == None:
instance.well_24 = sample['Unnamed: 1']
logger.debug(f"Already have that sample, going to add association to this plate.")
row, column = search_df_for_sample(instance.rsl_number)
# if elu_well != None:
# row = elu_well[0]
# col = elu_well[1:].zfill(2)
# # new.well_number = f"{row}{col}"
# else:
# # try:
# return_val += f"{new.rsl_number}\n"
# # except TypeError:
# # return_val = f"{new.rsl_number}\n"
new_list.append(dict(sample=instance, row=row, column=column))
return return_val, new_list
def parse_wastewater_artic_samples(self) -> Tuple[str|None, list[WWSample]]:
def parse_wastewater_artic_samples(self) -> Tuple[str|None, list[WastewaterSample]]:
"""
The artic samples are the wastewater samples that are to be sequenced, so we will need to look up existing ww samples and append the Artic well # and plate relation.
@@ -469,27 +598,32 @@ class SampleParser(object):
Returns:
list[WastewaterSample]: list of wastewater samples to be updated
"""
new_list = []
missed_samples = []
for sample in self.samples:
with self.ctx.database_session.no_autoflush:
instance = lookup_ww_sample_by_ww_sample_num(ctx=self.ctx, sample_number=sample['sample_name'])
logger.debug(f"Checking: {sample['sample_name']}")
logger.debug(f"Checking: {sample}")
if instance == None:
logger.error(f"Unable to find match for: {sample['sample_name']}")
logger.error(f"Unable to find match for: {sample['sample_name']}. Making new instance using {sample['submitter_id']}.")
instance = WastewaterSample()
instance.ww_processing_num = sample['sample_name']
instance.submitter_id = sample['submitter_id']
missed_samples.append(sample['sample_name'])
continue
logger.debug(f"Got instance: {instance.ww_sample_full_id}")
if sample['well'] != None:
row = sample['well'][0]
col = sample['well'][1:].zfill(2)
sample['well'] = f"{row}{col}"
instance.artic_well_number = sample['well']
new_list.append(instance)
# continue
logger.debug(f"Got instance: {instance.submitter_id}")
# if sample['row'] != None:
# row = int(row_keys[sample['well'][0]])
# if sample['column'] != None:
# column = int(sample['well'][1:])
# sample['well'] = f"{row}{col}"
# instance.artic_well_number = sample['well']
if instance.submitter_id != "NTC1" and instance.submitter_id != "NTC2":
new_list.append(dict(sample=instance, row=sample['row'], column=sample['column']))
missed_str = "\n\t".join(missed_samples)
return f"Could not find matches for the following samples:\n\t {missed_str}", new_list
class PCRParser(object):
"""
Object to pull data from Design and Analysis PCR export file.