Large scale refactor to improve db efficiency

This commit is contained in:
Landon Wark
2023-09-27 14:16:28 -05:00
parent 82ab06efad
commit e484eabb22
37 changed files with 1782 additions and 1697 deletions

View File

@@ -6,15 +6,14 @@ import pprint
from typing import List
import pandas as pd
from pathlib import Path
from backend.db import lookup_sample_by_submitter_id, get_reagents_in_extkit, lookup_kittype_by_name, lookup_submissiontype_by_name, models
from backend.db import models, lookup_kit_types, lookup_submission_type, lookup_samples
from backend.pydant import PydSubmission, PydReagent
import logging
from collections import OrderedDict
import re
import numpy as np
from datetime import date
from dateutil.parser import parse, ParserError
from tools import check_not_nan, RSLNamer, convert_nans_to_nones, Settings, convert_well_to_row_column
from tools import check_not_nan, RSLNamer, convert_nans_to_nones, Settings
from frontend.custom_widgets.pop_ups import SubmissionTypeSelector, KitSelector
logger = logging.getLogger(f"submissions.{__name__}")
@@ -106,11 +105,14 @@ class SheetParser(object):
self.sub[k] = v
logger.debug(f"Parser.sub after info scrape: {pprint.pformat(self.sub)}")
def parse_reagents(self):
def parse_reagents(self, extraction_kit:str|None=None):
"""
Pulls reagent info from the excel sheet
"""
self.sub['reagents'] = ReagentParser(ctx=self.ctx, xl=self.xl, submission_type=self.sub['submission_type'], extraction_kit=self.sub['extraction_kit']).parse_reagents()
if extraction_kit == None:
extraction_kit = extraction_kit=self.sub['extraction_kit']
logger.debug(f"Parsing reagents for {extraction_kit}")
self.sub['reagents'] = ReagentParser(ctx=self.ctx, xl=self.xl, submission_type=self.sub['submission_type'], extraction_kit=extraction_kit).parse_reagents()
def parse_samples(self):
"""
@@ -180,7 +182,8 @@ class SheetParser(object):
"""
Enforce that only allowed reagents get into the Pydantic Model
"""
allowed_reagents = [item.name for item in get_reagents_in_extkit(ctx=self.ctx, kit_name=self.sub['extraction_kit']['value'])]
kit = lookup_kit_types(ctx=self.ctx, name=self.sub['extraction_kit']['value'])
allowed_reagents = [item.name for item in kit.get_reagents()]
logger.debug(f"List of reagents for comparison with allowed_reagents: {pprint.pformat(self.sub['reagents'])}")
self.sub['reagents'] = [reagent for reagent in self.sub['reagents'] if reagent['value'].type in allowed_reagents]
@@ -217,7 +220,8 @@ class InfoParser(object):
if isinstance(submission_type, str):
submission_type = dict(value=submission_type, parsed=False)
logger.debug(f"Looking up submission type: {submission_type['value']}")
submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type['value'])
# submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type['value'])
submission_type = lookup_submission_type(ctx=self.ctx, name=submission_type['value'])
info_map = submission_type.info_map
return info_map
@@ -269,7 +273,9 @@ class ReagentParser(object):
self.xl = xl
def fetch_kit_info_map(self, extraction_kit:dict, submission_type:str):
kit = lookup_kittype_by_name(ctx=self.ctx, name=extraction_kit['value'])
if isinstance(extraction_kit, dict):
extraction_kit = extraction_kit['value']
kit = lookup_kit_types(ctx=self.ctx, name=extraction_kit)
if isinstance(submission_type, dict):
submission_type = submission_type['value']
reagent_map = kit.construct_xl_map_for_use(submission_type.title())
@@ -300,9 +306,9 @@ class ReagentParser(object):
logger.debug(f"Got lot for {item}-{name}: {lot} as {type(lot)}")
lot = str(lot)
listo.append(dict(value=PydReagent(type=item.strip(), lot=lot, exp=expiry, name=name), parsed=parsed))
logger.debug(f"Returning listo: {listo}")
return listo
class SampleParser(object):
"""
object to pull data for samples in excel sheet and construct individual sample objects
@@ -331,23 +337,48 @@ class SampleParser(object):
if isinstance(self.lookup_table, pd.DataFrame):
self.parse_lookup_table()
def fetch_sample_info_map(self, submission_type:dict) -> dict:
def fetch_sample_info_map(self, submission_type:str) -> dict:
"""
Gets info locations in excel book for submission type.
Args:
submission_type (str): submission type
Returns:
dict: Info locations.
"""
logger.debug(f"Looking up submission type: {submission_type}")
submission_type = lookup_submissiontype_by_name(ctx=self.ctx, type_name=submission_type)
submission_type = lookup_submission_type(ctx=self.ctx, name=submission_type)
logger.debug(f"info_map: {pprint.pformat(submission_type.info_map)}")
sample_info_map = submission_type.info_map['samples']
return sample_info_map
def construct_plate_map(self, plate_map_location:dict) -> pd.DataFrame:
"""
Gets location of samples from plate map grid in excel sheet.
Args:
plate_map_location (dict): sheet name, start/end row/column
Returns:
pd.DataFrame: Plate map grid
"""
df = self.xl.parse(plate_map_location['sheet'], header=None, dtype=object)
df = df.iloc[plate_map_location['start_row']-1:plate_map_location['end_row'], plate_map_location['start_column']-1:plate_map_location['end_column']]
# logger.debug(f"Input dataframe for plate map: {df}")
df = pd.DataFrame(df.values[1:], columns=df.iloc[0])
df = df.set_index(df.columns[0])
# logger.debug(f"Output dataframe for plate map: {df}")
return df
def construct_lookup_table(self, lookup_table_location) -> pd.DataFrame:
def construct_lookup_table(self, lookup_table_location:dict) -> pd.DataFrame:
"""
Gets table of misc information from excel book
Args:
lookup_table_location (dict): sheet name, start/end row
Returns:
pd.DataFrame: _description_
"""
try:
df = self.xl.parse(lookup_table_location['sheet'], header=None, dtype=object)
except KeyError:
@@ -355,16 +386,17 @@ class SampleParser(object):
df = df.iloc[lookup_table_location['start_row']-1:lookup_table_location['end_row']]
df = pd.DataFrame(df.values[1:], columns=df.iloc[0])
df = df.reset_index(drop=True)
# logger.debug(f"Dataframe for lookup table: {df}")
return df
def create_basic_dictionaries_from_plate_map(self):
"""
Parse sample location/name from plate map
"""
invalids = [0, "0", "EMPTY"]
new_df = self.plate_map.dropna(axis=1, how='all')
columns = new_df.columns.tolist()
for _, iii in new_df.iterrows():
for c in columns:
# logger.debug(f"Checking sample {iii[c]}")
if check_not_nan(iii[c]):
if iii[c] in invalids:
logger.debug(f"Invalid sample name: {iii[c]}, skipping.")
@@ -378,8 +410,10 @@ class SampleParser(object):
self.samples.append(dict(submitter_id=id, row=row_keys[iii._name], column=c))
def parse_lookup_table(self):
"""
Parse misc info from lookup table.
"""
def determine_if_date(input_str) -> str|date:
# logger.debug(f"Looks like we have a str: {input_str}")
regex = re.compile(r"^\d{4}-?\d{2}-?\d{2}")
if bool(regex.search(input_str)):
logger.warning(f"{input_str} is a date!")
@@ -407,11 +441,19 @@ class SampleParser(object):
sample[k] = v
logger.debug(f"Output sample dict: {sample}")
def parse_samples(self, generate:bool=True) -> List[dict]:
def parse_samples(self, generate:bool=True) -> List[dict]|List[models.BasicSample]:
"""
Parse merged platemap\lookup info into dicts/samples
Args:
generate (bool, optional): Indicates if sample objects to be generated from dicts. Defaults to True.
Returns:
List[dict]|List[models.BasicSample]: List of samples
"""
result = None
new_samples = []
for ii, sample in enumerate(self.samples):
# logger.debug(f"\n\n{new_samples}\n\n")
try:
if sample['submitter_id'] in [check_sample['sample'].submitter_id for check_sample in new_samples]:
sample['submitter_id'] = f"{sample['submitter_id']}-{ii}"
@@ -432,7 +474,6 @@ class SampleParser(object):
translated_dict[k] = convert_nans_to_nones(v)
translated_dict['sample_type'] = f"{self.submission_type} Sample"
parser_query = f"parse_{translated_dict['sample_type'].replace(' ', '_').lower()}"
# logger.debug(f"New sample dictionary going into object creation:\n{translated_dict}")
try:
custom_parser = getattr(self, parser_query)
translated_dict = custom_parser(translated_dict)
@@ -445,6 +486,15 @@ class SampleParser(object):
return result, new_samples
def generate_sample_object(self, input_dict) -> models.BasicSample:
"""
Constructs sample object from dict
Args:
input_dict (dict): sample information
Returns:
models.BasicSample: Sample object
"""
query = input_dict['sample_type'].replace(" ", "")
try:
database_obj = getattr(models, query)
@@ -452,13 +502,12 @@ class SampleParser(object):
logger.error(f"Could not find the model {query}. Using generic.")
database_obj = models.BasicSample
logger.debug(f"Searching database for {input_dict['submitter_id']}...")
instance = lookup_sample_by_submitter_id(ctx=self.ctx, submitter_id=input_dict['submitter_id'])
instance = lookup_samples(ctx=self.ctx, submitter_id=input_dict['submitter_id'])
if instance == None:
logger.debug(f"Couldn't find sample {input_dict['submitter_id']}. Creating new sample.")
instance = database_obj()
for k,v in input_dict.items():
try:
# setattr(instance, k, v)
instance.set_attribute(k, v)
except Exception as e:
logger.error(f"Failed to set {k} due to {type(e).__name__}: {e}")
@@ -511,12 +560,27 @@ class SampleParser(object):
return input_dict
def parse_first_strand_sample(self, input_dict:dict) -> dict:
"""
Update sample dictionary with first strand specific information
Args:
input_dict (dict): Input sample dictionary
Returns:
dict: Updated sample dictionary
"""
logger.debug("Called first strand sample parser")
input_dict['well'] = re.search(r"\s\((.*)\)$", input_dict['submitter_id']).groups()[0]
input_dict['submitter_id'] = re.sub(r"\s\(.*\)$", "", str(input_dict['submitter_id'])).strip()
return input_dict
def grab_plates(self):
def grab_plates(self) -> List[str]:
"""
Parse plate names from
Returns:
List[str]: list of plate names.
"""
plates = []
for plate in self.plates:
df = self.xl.parse(plate['sheet'], header=None)
@@ -526,8 +590,7 @@ class SampleParser(object):
continue
plates.append(output)
return plates
class PCRParser(object):
"""
Object to pull data from Design and Analysis PCR export file.
@@ -574,7 +637,6 @@ class PCRParser(object):
sheet_name (str): Name of sheet in excel workbook that holds info.
"""
df = self.xl.parse(sheet_name=sheet_name, dtype=object).fillna("")
# self.pcr['file'] = df.iloc[1][1]
self.pcr['comment'] = df.iloc[0][1]
self.pcr['operator'] = df.iloc[1][1]
self.pcr['barcode'] = df.iloc[2][1]
@@ -615,7 +677,6 @@ class PCRParser(object):
except ValueError:
logger.error("Well call number doesn't match sample number")
logger.debug(f"Well call df: {well_call_df}")
# iloc is [row][column]
for ii, row in self.samples_df.iterrows():
try:
sample_obj = [sample for sample in self.samples if sample['sample'] == row[3]][0]
@@ -623,14 +684,8 @@ class PCRParser(object):
sample_obj = dict(
sample = row['Sample'],
plate_rsl = self.plate_num,
# elution_well = row['Well Position']
)
logger.debug(f"Got sample obj: {sample_obj}")
# logger.debug(f"row: {row}")
# rsl_num = row[3]
# # logger.debug(f"Looking up: {rsl_num}")
# ww_samp = lookup_ww_sample_by_rsl_sample_number(ctx=self.ctx, rsl_number=rsl_num)
# logger.debug(f"Got: {ww_samp}")
if isinstance(row['Cq'], float):
sample_obj[f"ct_{row['Target'].lower()}"] = row['Cq']
else:
@@ -639,20 +694,6 @@ class PCRParser(object):
sample_obj[f"{row['Target'].lower()}_status"] = row['Assessment']
except KeyError:
logger.error(f"No assessment for {sample_obj['sample']}")
# match row["Target"]:
# case "N1":
# if isinstance(row['Cq'], float):
# sample_obj['ct_n1'] = row["Cq"]
# else:
# sample_obj['ct_n1'] = 0.0
# sample_obj['n1_status'] = row['Assessment']
# case "N2":
# if isinstance(row['Cq'], float):
# sample_obj['ct_n2'] = row['Assessment']
# else:
# sample_obj['ct_n2'] = 0.0
# case _:
# logger.warning(f"Unexpected input for row[4]: {row["Target"]}")
self.samples.append(sample_obj)

View File

@@ -14,7 +14,7 @@ env = jinja_template_loading()
logger = logging.getLogger(f"submissions.{__name__}")
def make_report_xlsx(records:list[dict]) -> DataFrame:
def make_report_xlsx(records:list[dict]) -> Tuple[DataFrame, DataFrame]:
"""
create the dataframe for a report
@@ -92,7 +92,6 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
"""
df = DataFrame.from_records(input)
# df.to_excel("test.xlsx", engine="openpyxl")
safe = ['name', 'submitted_date', 'genus', 'target']
for column in df.columns:
if "percent" in column:
@@ -102,7 +101,6 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
if column not in safe:
if subtype != None and column != subtype:
del df[column]
# logger.debug(df)
# move date of sample submitted on same date as previous ahead one.
df = displace_date(df)
# ad hoc method to make data labels more accurate.
@@ -215,14 +213,10 @@ def drop_reruns_from_df(ctx:dict, df: DataFrame) -> DataFrame:
"""
if 'rerun_regex' in ctx:
sample_names = get_unique_values_in_df_column(df, column_name="name")
# logger.debug(f"Compiling regex from: {settings['rerun_regex']}")
rerun_regex = re.compile(fr"{ctx['rerun_regex']}")
for sample in sample_names:
# logger.debug(f'Running search on {sample}')
if rerun_regex.search(sample):
# logger.debug(f'Match on {sample}')
first_run = re.sub(rerun_regex, "", sample)
# logger.debug(f"First run: {first_run}")
df = df.drop(df[df.name == first_run].index)
return df