Pydantic added for validation.

Landon Wark
2023-07-07 14:27:26 -05:00
parent 0c81c74f70
commit 1c804bfc6a
12 changed files with 497 additions and 141 deletions
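The backend.pydant module that supplies PydSubmission is not among the files shown below. A minimal sketch of what such a model might look like, assuming pydantic v1, with field names inferred from the SheetParser.sub keys used in this commit; the technician clean-up mirrors the logic this commit removes from the parser (this is an illustration, not the actual contents of backend/pydant.py):

    # hypothetical sketch only -- not the real backend/pydant.py
    import re
    from datetime import date
    from pathlib import Path
    from typing import List
    from pydantic import BaseModel, validator

    class PydReagent(BaseModel):
        type: str
        lot: str
        exp: date

    class PydSubmission(BaseModel):
        filepath: Path
        technician: str
        reagents: List[PydReagent] = []

        class Config:
            # keep keys the model does not declare (e.g. 'samples', 'sample_count')
            extra = "allow"

        @validator("technician")
        def clean_technician(cls, value):
            # pandas renders an empty technician cell as the string "nan"
            if value == "nan":
                return "Unknown"
            if len(value.split(",")) > 1:
                # keep only two-letter initials, e.g. "AB, CD"
                return ", ".join(re.findall(r"[A-Z]{2}", value))
            return value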

View File

@@ -8,6 +8,7 @@ import pandas as pd
from pathlib import Path
from backend.db.models import WWSample, BCSample
from backend.db import lookup_ww_sample_by_ww_sample_num
from backend.pydant import PydSubmission
import logging
from collections import OrderedDict
import re
@@ -149,19 +150,22 @@ class SheetParser(object):
else:
logger.debug(f"Date: {row[3]}")
expiry = date.today()
self.sub[f"lot_{reagent_type}"] = {'lot':output_var, 'exp':expiry}
# self.sub[f"lot_{reagent_type}"] = {'lot':output_var, 'exp':expiry}
self.sub['reagents'].append(dict(type=reagent_type, lot=output_var, exp=expiry))
submission_info = self.parse_generic("Sample List")
# iloc is [row][column] and the first row is set as header row so -2
tech = str(submission_info.iloc[11][1])
if tech == "nan":
tech = "Unknown"
elif len(tech.split(",")) > 1:
tech_reg = re.compile(r"[A-Z]{2}")
tech = ", ".join(tech_reg.findall(tech))
# moved to pydantic model
# if tech == "nan":
# tech = "Unknown"
# elif len(tech.split(",")) > 1:
# tech_reg = re.compile(r"[A-Z]{2}")
# tech = ", ".join(tech_reg.findall(tech))
self.sub['technician'] = tech
# reagents
# must be prefixed with 'lot_' to be recognized by gui
# Todo: find a more adaptable way to read reagents.
# TODO: find a more adaptable way to read reagents.
self.sub['reagents'] = []
reagent_range = submission_info.iloc[1:14, 4:8]
logger.debug(reagent_range)
parse_reagents(reagent_range)
@@ -210,7 +214,8 @@ class SheetParser(object):
expiry = date.today()
else:
expiry = date.today()
self.sub[f"lot_{output_key}"] = {'lot':output_var, 'exp':expiry}
# self.sub[f"lot_{output_key}"] = {'lot':output_var, 'exp':expiry}
self.sub['reagents'].append(dict(type=output_key, lot=output_var, exp=expiry))
# parse submission sheet
submission_info = self.parse_generic("WW Submissions (ENTER HERE)")
# parse enrichment sheet
@@ -227,6 +232,7 @@ class SheetParser(object):
pcr_reagent_range = qprc_info.iloc[0:5, 9:20]
# compile technician info
self.sub['technician'] = f"Enr: {enrichment_info.columns[2]}, Ext: {extraction_info.columns[2]}, PCR: {qprc_info.columns[2]}"
self.sub['reagents'] = []
parse_reagents(enr_reagent_range)
parse_reagents(ext_reagent_range)
parse_reagents(pcr_reagent_range)
@@ -271,7 +277,7 @@ class SheetParser(object):
else:
logger.debug(f"Date: {row[2]}")
expiry = date.today()
self.sub[f"lot_{output_key}"] = {'lot':output_var, 'exp':expiry}
self.sub['reagents'].append(dict(type=output_key, lot=output_var, exp=expiry))
else:
continue
def massage_samples(df:pd.DataFrame) -> pd.DataFrame:
@@ -303,6 +309,7 @@ class SheetParser(object):
self.sub['sample_count'] = submission_info.iloc[4][6]
self.sub['extraction_kit'] = "ArticV4.1"
self.sub['technician'] = f"MM: {biomek_info.iloc[2][1]}, Bio: {biomek_info.iloc[3][1]}"
self.sub['reagents'] = []
parse_reagents(sub_reagent_range)
parse_reagents(biomek_reagent_range)
samples = massage_samples(biomek_info.iloc[22:31, 0:])
@@ -311,6 +318,18 @@ class SheetParser(object):
self.sample_result, self.sub['samples'] = sample_parse()
def to_pydantic(self) -> PydSubmission:
"""
Generates a pydantic model of scraped data for validation
Returns:
PydSubmission: output pydantic model
"""
psm = PydSubmission(filepath=self.filepath, **self.sub)
delattr(psm, "filepath")
return psm
class SampleParser(object):
"""
@@ -366,7 +385,7 @@ class SampleParser(object):
list[WWSample]: list of sample objects
"""
def search_df_for_sample(sample_rsl:str):
logger.debug(f"Attempting to find sample {sample_rsl} in \n {self.elution_map}")
# logger.debug(f"Attempting to find sample {sample_rsl} in \n {self.elution_map}")
well = self.elution_map.where(self.elution_map==sample_rsl)
# logger.debug(f"Well: {well}")
well = well.dropna(how='all').dropna(axis=1, how="all")

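Taken together, the parser changes in this file replace the per-reagent "lot_<type>" keys with a single "reagents" list, which is what PydSubmission receives from to_pydantic. A small illustration of the shape change, with made-up reagent values:

    from datetime import date

    sub = {"reagents": []}
    # before this commit: one dict per reagent, keyed with a 'lot_' prefix for the GUI
    # sub["lot_wash"] = {"lot": "ABC123", "exp": date(2023, 7, 7)}
    # after this commit: a single list with one entry per reagent
    sub["reagents"].append(dict(type="wash", lot="ABC123", exp=date(2023, 7, 7)))
    print(sub["reagents"])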
View File

@@ -9,6 +9,7 @@ import sys
from pathlib import Path
import re
from tools import check_if_app
from typing import Tuple
logger = logging.getLogger(f"submissions.{__name__}")
@@ -154,23 +155,61 @@ def displace_date(df:DataFrame) -> DataFrame:
# get submitted dates for each control
dict_list = [dict(name=item, date=df[df.name == item].iloc[0]['submitted_date']) for item in sorted(df['name'].unique())]
previous_dates = []
for ii, item in enumerate(dict_list):
try:
# check = item['date'] == dict_list[ii-1]['date']
check = item['date'] in previous_dates
except IndexError:
check = False
if check:
# occurences = previous_dates.count(item['date'])
logger.debug(f"We found one! Increment date!\n\t{item['date'] - timedelta(days=1)}")
# get df locations where name == item name
mask = df['name'] == item['name']
# increment date in dataframe
df.loc[mask, 'submitted_date'] = df.loc[mask, 'submitted_date'].apply(lambda x: x + timedelta(days=1))
previous_dates.append(item['date'] + timedelta(days=1))
else:
previous_dates.append(item['date'])
for _, item in enumerate(dict_list):
# try:
# # check = item['date'] == dict_list[ii-1]['date']
# check = item['date'] in previous_dates
# except IndexError:
# check = False
# if check:
# # occurences = previous_dates.count(item['date'])
# logger.debug(f"We found one! Increment date!\n\t{item['date']} to {item['date'] + timedelta(days=1)}")
# # get df locations where name == item name
# mask = df['name'] == item['name']
# # increment date in dataframe
# df.loc[mask, 'submitted_date'] = df.loc[mask, 'submitted_date'].apply(lambda x: x + timedelta(days=1))
# outdate = item['date'] + timedelta(days=1)
# # previous_dates.append(item['date'] + timedelta(days=1))
# else:
# outdate = item['date']
# previous_dates.append(outdate)
# logger.debug(f"\n\tCurrent date: {outdate}\n\tPrevious dates:{previous_dates}")
# logger.debug(type(item))
df, previous_dates = check_date(df=df, item=item, previous_dates=previous_dates)
return df
def check_date(df:DataFrame, item:dict, previous_dates:list) -> Tuple[DataFrame, list]:
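"""
Checks whether an item's date collides with one already in previous_dates;
if so, pushes that control's submitted_date forward one day and re-checks.
Returns:
Tuple[DataFrame, list]: adjusted dataframe and updated list of used dates
"""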
try:
# check = item['date'] == dict_list[ii-1]['date']
check = item['date'] in previous_dates
except IndexError:
check = False
previous_dates.append(item['date'])
if check:
# occurences = previous_dates.count(item['date'])
logger.debug(f"We found one! Increment date!\n\t{item['date']} to {item['date'] + timedelta(days=1)}")
# get df locations where name == item name
mask = df['name'] == item['name']
# increment date in dataframe
df.loc[mask, 'submitted_date'] = df.loc[mask, 'submitted_date'].apply(lambda x: x + timedelta(days=1))
item['date'] += timedelta(days=1)
# previous_dates.append(item['date'] + timedelta(days=1))
passed = False
else:
passed = True
logger.debug(f"\n\tCurrent date: {item['date']}\n\tPrevious dates:{previous_dates}")
logger.debug(f"DF: {type(df)}, previous_dates: {type(previous_dates)}")
# if run didn't lead to changed date, return values
if passed:
logger.debug(f"Date check passed, returning.")
return df, previous_dates
# if date was changed, rerun with new date
else:
logger.warning(f"Date check failed, running recursion")
df, previous_dates = check_date(df, item, previous_dates)
return df, previous_dates
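The same displacement idea, stripped of the dataframe bookkeeping, in a standalone sketch; check_date recurses, while the sketch below uses a loop for brevity, and the function and variable names here are illustrative rather than part of the project:

    from datetime import date, timedelta
    from typing import List

    def push_until_unique(d: date, taken: List[date]) -> date:
        # keep shifting the date forward a day until it no longer collides
        while d in taken:
            d += timedelta(days=1)
        taken.append(d)
        return d

    submitted = [date(2023, 7, 7), date(2023, 7, 7), date(2023, 7, 8)]
    used: List[date] = []
    resolved = [push_until_unique(d, used) for d in submitted]
    print([d.isoformat() for d in resolved])
    # ['2023-07-07', '2023-07-08', '2023-07-09']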
def get_unique_values_in_df_column(df: DataFrame, column_name: str) -> list: