Added documentation and converted to username-based exclusion for adding new kits

This commit is contained in:
Landon Wark
2023-01-30 12:07:38 -06:00
parent bbb65d3fe6
commit 1f832dccf2
16 changed files with 876 additions and 296 deletions

View File

@@ -1,4 +1,3 @@
from pandas import DataFrame
import re
@@ -6,14 +5,14 @@ import re
def get_unique_values_in_df_column(df: DataFrame, column_name: str) -> list:
"""
_summary_
get all unique values in a dataframe column by name
Args:
df (DataFrame): _description_
column_name (str): _description_
df (DataFrame): input dataframe
column_name (str): name of column of interest
Returns:
list: _description_
list: sorted list of unique values
"""
return sorted(df[column_name].unique())
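A quick usage illustration of this helper, using a made-up dataframe:

from pandas import DataFrame

df = DataFrame({"Submitting Lab": ["Lab B", "Lab A", "Lab B"]})  # toy data
get_unique_values_in_df_column(df, "Submitting Lab")  # ['Lab A', 'Lab B']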
@@ -23,7 +22,7 @@ def drop_reruns_from_df(ctx:dict, df: DataFrame) -> DataFrame:
Removes semi-duplicates from dataframe after finding sequencing repeats.
Args:
settings (dict): settings passed down from click
ctx (dict): settings passed from gui
df (DataFrame): initial dataframe
Returns:

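The body of drop_reruns_from_df is not visible in this hunk. Purely as a sketch of the idea, de-duplicating repeated sequencing runs could look something like the following, where the "Sample Name" column and the keep-last policy are illustrative assumptions rather than the repo's actual logic:

from pandas import DataFrame

def drop_reruns_sketch(df: DataFrame) -> DataFrame:
    # keep only the most recent row for each repeated sample (assumed policy)
    return df.drop_duplicates(subset=["Sample Name"], keep="last")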
View File

@@ -11,40 +11,68 @@ import uuid
logger = logging.getLogger(f"submissions.{__name__}")
class SheetParser(object):
def __init__(self, filepath:Path|None = None, **kwargs):
"""
object to pull and contain data from excel file
"""
def __init__(self, filepath:Path|None = None, **kwargs) -> None:
"""
Args:
filepath (Path | None, optional): file path to excel sheet. Defaults to None.
"""
logger.debug(f"Parsing {filepath.__str__()}")
# set attributes based on kwargs from gui ctx
for kwarg in kwargs:
setattr(self, f"_{kwarg}", kwargs[kwarg])
if filepath == None:
logger.debug(f"No filepath.")
logger.error(f"No filepath given.")
self.xl = None
else:
try:
self.xl = pd.ExcelFile(filepath.__str__())
except ValueError:
except ValueError as e:
logger.error(f"Incorrect value: {e}")
self.xl = None
self.sub = OrderedDict()
self.sub['submission_type'] = self._type_decider()
# make decision about type of sample we have
self.sub['submission_type'] = self._type_decider()
# select proper parser based on sample type
parse_sub = getattr(self, f"_parse_{self.sub['submission_type'].lower()}")
parse_sub()
def _type_decider(self):
def _type_decider(self) -> str:
"""
makes decisions about submission type based on structure of excel file
Returns:
str: submission type name
"""
try:
for type in self._submission_types:
if self.xl.sheet_names == self._submission_types[type]['excel_map']:
return type.title()
return "Unknown"
except:
except Exception as e:
logger.warning(f"We were unable to parse the submission type due to: {e}")
return "Unknown"
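Because each kwarg passed to __init__ is stored as an underscore-prefixed attribute, _type_decider expects a mapping shaped like the one sketched below. The type keys, sheet lists, and file path here are hypothetical examples, not the project's real configuration:

from pathlib import Path

submission_types = {
    "bacterial_culture": {"excel_map": ["Sample List"]},            # example only
    "wastewater": {"excel_map": ["WW Submissions (ENTER HERE)",     # example only
                                 "Enrichment Worksheet"]},
}
# stored on the instance as self._submission_types
parser = SheetParser(Path("submission.xlsx"), submission_types=submission_types)

A type is only reported when the workbook's sheet_names list matches an excel_map exactly; anything else falls back to "Unknown" and is handled by _parse_unknown.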
def _parse_unknown(self):
def _parse_unknown(self) -> None:
"""
Dummy function to handle unknown excel structures
"""
self.sub = None
def _parse_generic(self, sheet_name:str):
def _parse_generic(self, sheet_name:str) -> pd.DataFrame:
"""
Pulls information common to all submission types and passes on dataframe
Args:
sheet_name (str): name of excel worksheet to pull from
Returns:
pd.DataFrame: relevant dataframe from excel sheet
"""
submission_info = self.xl.parse(sheet_name=sheet_name, dtype=object)
self.sub['submitter_plate_num'] = submission_info.iloc[0][1] #if pd.isnull(submission_info.iloc[0][1]) else string_formatter(submission_info.iloc[0][1])
@@ -57,7 +85,10 @@ class SheetParser(object):
return submission_info
def _parse_bacterial_culture(self):
def _parse_bacterial_culture(self) -> None:
"""
pulls info specific to bacterial culture sample type
"""
submission_info = self._parse_generic("Sample List")
# iloc is [row][column]; the first row becomes the header, so iloc row = spreadsheet row - 2
tech = str(submission_info.iloc[11][1])
@@ -68,7 +99,7 @@ class SheetParser(object):
tech = ", ".join(tech_reg.findall(tech))
self.sub['technician'] = tech
# reagents
# must be prefixed with 'lot_' to be recognized by gui
self.sub['lot_wash_1'] = submission_info.iloc[1][6] #if pd.isnull(submission_info.iloc[1][6]) else string_formatter(submission_info.iloc[1][6])
self.sub['lot_wash_2'] = submission_info.iloc[2][6] #if pd.isnull(submission_info.iloc[2][6]) else string_formatter(submission_info.iloc[2][6])
self.sub['lot_binding_buffer'] = submission_info.iloc[3][6] #if pd.isnull(submission_info.iloc[3][6]) else string_formatter(submission_info.iloc[3][6])
@@ -79,13 +110,17 @@ class SheetParser(object):
self.sub['lot_ethanol'] = submission_info.iloc[10][6] #if pd.isnull(submission_info.iloc[10][6]) else string_formatter(submission_info.iloc[10][6])
self.sub['lot_positive_control'] = submission_info.iloc[103][1] #if pd.isnull(submission_info.iloc[103][1]) else string_formatter(submission_info.iloc[103][1])
self.sub['lot_plate'] = submission_info.iloc[12][6] #if pd.isnull(submission_info.iloc[12][6]) else string_formatter(submission_info.iloc[12][6])
# get individual sample info
sample_parser = SampleParser(submission_info.iloc[15:111])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
logger.debug(f"Parser result: {self.sub}")
self.sub['samples'] = sample_parse()
def _parse_wastewater(self):
def _parse_wastewater(self) -> None:
"""
pulls info specific to wastewater sample type
"""
# submission_info = self.xl.parse("WW Submissions (ENTER HERE)")
submission_info = self._parse_generic("WW Submissions (ENTER HERE)")
enrichment_info = self.xl.parse("Enrichment Worksheet", dtype=object)
@@ -108,19 +143,28 @@ class SheetParser(object):
self.sub['lot_pre_mix_2'] = qprc_info.iloc[2][14] #if pd.isnull(qprc_info.iloc[2][14]) else string_formatter(qprc_info.iloc[2][14])
self.sub['lot_positive_control'] = qprc_info.iloc[3][14] #if pd.isnull(qprc_info.iloc[3][14]) else string_formatter(qprc_info.iloc[3][14])
self.sub['lot_ddh2o'] = qprc_info.iloc[4][14] #if pd.isnull(qprc_info.iloc[4][14]) else string_formatter(qprc_info.iloc[4][14])
# get individual sample info
sample_parser = SampleParser(submission_info.iloc[16:40])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
self.sub['samples'] = sample_parse()
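Both parse paths rely on the same getattr dispatch: the decided submission type selects a parser method by name at runtime, and an unrecognized layout falls through to _parse_unknown. A stripped-down sketch of the idiom (class and method names here are illustrative, not the repo's):

class Dispatcher:
    def handle(self, submission_type: str) -> None:
        # resolve e.g. "_parse_wastewater" from the type string;
        # raises AttributeError if no matching parser method exists
        parse = getattr(self, f"_parse_{submission_type.lower()}")
        parse()

    def _parse_wastewater(self) -> None:
        print("parsing wastewater")

Dispatcher().handle("Wastewater")  # prints: parsing wastewater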
class SampleParser(object):
"""
object to pull data for samples in excel sheet and construct individual sample objects
"""
def __init__(self, df:pd.DataFrame) -> None:
self.samples = df.to_dict("records")
def parse_bacterial_culture_samples(self) -> list[BCSample]:
"""
construct bacterial culture specific sample objects
Returns:
list[BCSample]: list of sample objects
"""
new_list = []
for sample in self.samples:
new = BCSample()
@@ -130,6 +174,7 @@ class SampleParser(object):
new.concentration = sample['Unnamed: 3']
# logger.debug(f"Sample object: {new.sample_id} = {type(new.sample_id)}")
logger.debug(f"Got sample_id: {new.sample_id}")
# need to exclude empties and blanks
try:
not_a_nan = not np.isnan(new.sample_id) and str(new.sample_id).lower() != 'blank'
except TypeError:
@@ -140,10 +185,17 @@ class SampleParser(object):
def parse_wastewater_samples(self) -> list[WWSample]:
"""
construct wastewater specific sample objects
Returns:
list[WWSample]: list of sample objects
"""
new_list = []
for sample in self.samples:
new = WWSample()
new.ww_processing_num = sample['Unnamed: 2']
# need to ensure we have a sample id for database integrity
try:
not_a_nan = not np.isnan(sample['Unnamed: 3'])
except TypeError:
@@ -153,6 +205,7 @@ class SampleParser(object):
else:
new.ww_sample_full_id = uuid.uuid4().hex.upper()
new.rsl_number = sample['Unnamed: 9']
# need to ensure we get a collection date
try:
not_a_nan = not np.isnan(sample['Unnamed: 5'])
except TypeError:
@@ -169,11 +222,11 @@ class SampleParser(object):
return new_list
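The repeated try/except around np.isnan exists because np.isnan raises TypeError for strings, and the uuid fallback guarantees every wastewater sample gets a non-empty id. Condensed into a helper (the helper itself is illustrative, not part of the commit):

import uuid
import numpy as np

def cell_or_generated_id(value) -> str:
    try:
        missing = np.isnan(value)   # blank cells come through as float NaN
    except TypeError:
        missing = False             # strings are never NaN
    return uuid.uuid4().hex.upper() if missing else str(value)

cell_or_generated_id(float("nan"))  # random 32-character hex id
cell_or_generated_id("WW-0001")     # 'WW-0001'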
def string_formatter(input):
logger.debug(f"{input} : {type(input)}")
match input:
case int() | float() | np.float64:
return "{:0.0f}".format(input)
case _:
return input
# def string_formatter(input):
# logger.debug(f"{input} : {type(input)}")
# match input:
# case int() | float() | np.float64:
# return "{:0.0f}".format(input)
# case _:
# return input
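One caveat if the commented-out string_formatter is ever revived: inside match, the bare dotted name np.float64 is a value pattern (an equality test against the class object), not an instance check. Instances still match today only because np.float64 subclasses float; the explicit class pattern needs parentheses, as in this sketch:

import numpy as np

def string_formatter(value):
    match value:
        case int() | float() | np.float64():  # class patterns need ()
            return "{:0.0f}".format(value)
        case _:
            return value

string_formatter(np.float64(12.0))  # '12'
string_formatter("RSL-22-0001")     # 'RSL-22-0001'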

View File

@@ -8,13 +8,22 @@ import logging
logger = logging.getLogger(f"submissions.{__name__}")
def make_report_xlsx(records:list[dict]) -> DataFrame:
"""
create the dataframe for a report
Args:
records (list[dict]): list of dictionaries created from submissions
Returns:
DataFrame: output dataframe
"""
df = DataFrame.from_records(records)
# put submissions with the same lab together
df = df.sort_values("Submitting Lab")
# table = df.pivot_table(values="Cost", index=["Submitting Lab", "Extraction Kit"], columns=["Cost", "Sample Count"], aggfunc={'Cost':np.sum,'Sample Count':np.sum})
# aggregate cost and sample count columns
df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg({'Cost': ['sum', 'count'], 'Sample Count':['sum']})
# df2['Cost'] = df2['Cost'].map('${:,.2f}'.format)
logger.debug(df2.columns)
# df2['Cost']['sum'] = df2['Cost']['sum'].apply('${:,.2f}'.format)
# apply formatting to cost column
df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')] = df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')].applymap('${:,.2f}'.format)
return df2
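The boolean masks over get_level_values work, but with a two-level column index the same cost formatting can also be addressed by column tuple, which may read more clearly. A small self-contained sketch of the equivalent operation, using toy records in place of the real submission summaries:

import pandas as pd

df = pd.DataFrame({
    "Submitting Lab": ["Lab A", "Lab A", "Lab B"],
    "Extraction Kit": ["Kit 1", "Kit 1", "Kit 2"],
    "Cost": [100.0, 250.5, 75.25],
    "Sample Count": [8, 16, 4],
})
df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg(
    {"Cost": ["sum", "count"], "Sample Count": ["sum"]}
)
# ('Cost', 'sum') addresses the summed-cost column directly
df2[("Cost", "sum")] = df2[("Cost", "sum")].map("${:,.2f}".format)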
@@ -65,7 +74,18 @@ def make_report_xlsx(records:list[dict]) -> DataFrame:
# dfs['name'] = df
# return dfs
def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):
def convert_control_by_mode(ctx:dict, control:models.Control, mode:str) -> list[dict]:
"""
split control object into analysis types
Args:
ctx (dict): settings passed from gui
control (models.Control): control to be parsed into list
mode (str): analysis type
Returns:
list[dict]: list of records
"""
output = []
data = json.loads(getattr(control, mode))
for genus in data:
@@ -82,6 +102,17 @@ def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):
def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -> DataFrame:
"""
Convert list of control records to dataframe
Args:
ctx (dict): settings passed from gui
input (list[dict]): list of dictionaries containing records
subtype (str | None, optional): _description_. Defaults to None.
Returns:
DataFrame: dataframe of control records
"""
df = DataFrame.from_records(input)
safe = ['name', 'submitted_date', 'genus', 'target']
logger.debug(df)