Added documentation and converted to username-based exclusion for adding new kits

This commit is contained in:
Landon Wark
2023-01-30 12:07:38 -06:00
parent bbb65d3fe6
commit 1f832dccf2
16 changed files with 876 additions and 296 deletions

View File

@@ -1,4 +1,3 @@
from pandas import DataFrame
import re
@@ -6,14 +5,14 @@ import re
def get_unique_values_in_df_column(df: DataFrame, column_name: str) -> list:
"""
_summary_
get all unique values in a dataframe column by name
Args:
df (DataFrame): _description_
column_name (str): _description_
df (DataFrame): input dataframe
column_name (str): name of column of interest
Returns:
list: _description_
list: sorted list of unique values
"""
return sorted(df[column_name].unique())
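A quick usage illustration of this helper, using a made-up dataframe:

from pandas import DataFrame

df = DataFrame({"Submitting Lab": ["Lab B", "Lab A", "Lab B"]})  # toy data
get_unique_values_in_df_column(df, "Submitting Lab")  # ['Lab A', 'Lab B']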
@@ -23,7 +22,7 @@ def drop_reruns_from_df(ctx:dict, df: DataFrame) -> DataFrame:
Removes semi-duplicates from dataframe after finding sequencing repeats.
Args:
settings (dict): settings passed down from click
ctx (dict): settings passed from gui
df (DataFrame): initial dataframe
Returns:

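The body of drop_reruns_from_df is not visible in this hunk. Purely as a sketch of the idea, de-duplicating repeated sequencing runs could look something like the following, where the "Sample Name" column and the keep-last policy are illustrative assumptions rather than the repo's actual logic:

from pandas import DataFrame

def drop_reruns_sketch(df: DataFrame) -> DataFrame:
    # keep only the most recent row for each repeated sample (assumed policy)
    return df.drop_duplicates(subset=["Sample Name"], keep="last")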
View File

@@ -11,40 +11,68 @@ import uuid
logger = logging.getLogger(f"submissions.{__name__}")
class SheetParser(object):
def __init__(self, filepath:Path|None = None, **kwargs):
"""
object to pull and contain data from excel file
"""
def __init__(self, filepath:Path|None = None, **kwargs) -> None:
"""
Args:
filepath (Path | None, optional): file path to excel sheet. Defaults to None.
"""
logger.debug(f"Parsing {filepath.__str__()}")
# set attributes based on kwargs from gui ctx
for kwarg in kwargs:
setattr(self, f"_{kwarg}", kwargs[kwarg])
if filepath == None:
logger.debug(f"No filepath.")
logger.error(f"No filepath given.")
self.xl = None
else:
try:
self.xl = pd.ExcelFile(filepath.__str__())
except ValueError:
except ValueError as e:
logger.error(f"Incorrect value: {e}")
self.xl = None
self.sub = OrderedDict()
self.sub['submission_type'] = self._type_decider()
# make decision about type of sample we have
self.sub['submission_type'] = self._type_decider()
# select proper parser based on sample type
parse_sub = getattr(self, f"_parse_{self.sub['submission_type'].lower()}")
parse_sub()
def _type_decider(self):
def _type_decider(self) -> str:
"""
makes decisions about submission type based on structure of excel file
Returns:
str: submission type name
"""
try:
for type in self._submission_types:
if self.xl.sheet_names == self._submission_types[type]['excel_map']:
return type.title()
return "Unknown"
except:
except Exception as e:
logger.warning(f"We were unable to parse the submission type due to: {e}")
return "Unknown"
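Because each kwarg passed to __init__ is stored as an underscore-prefixed attribute, _type_decider expects a mapping shaped like the one sketched below. The type keys, sheet lists, and file path here are hypothetical examples, not the project's real configuration:

from pathlib import Path

submission_types = {
    "bacterial_culture": {"excel_map": ["Sample List"]},            # example only
    "wastewater": {"excel_map": ["WW Submissions (ENTER HERE)",     # example only
                                 "Enrichment Worksheet"]},
}
# stored on the instance as self._submission_types
parser = SheetParser(Path("submission.xlsx"), submission_types=submission_types)

A type is only reported when the workbook's sheet_names list matches an excel_map exactly; anything else falls back to "Unknown" and is handled by _parse_unknown.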
def _parse_unknown(self):
def _parse_unknown(self) -> None:
"""
Dummy function to handle unknown excel structures
"""
self.sub = None
def _parse_generic(self, sheet_name:str):
def _parse_generic(self, sheet_name:str) -> pd.DataFrame:
"""
Pulls information common to all submission types and passes on dataframe
Args:
sheet_name (str): name of excel worksheet to pull from
Returns:
pd.DataFrame: relevant dataframe from excel sheet
"""
submission_info = self.xl.parse(sheet_name=sheet_name, dtype=object)
self.sub['submitter_plate_num'] = submission_info.iloc[0][1] #if pd.isnull(submission_info.iloc[0][1]) else string_formatter(submission_info.iloc[0][1])
@@ -57,7 +85,10 @@ class SheetParser(object):
return submission_info
def _parse_bacterial_culture(self):
def _parse_bacterial_culture(self) -> None:
"""
pulls info specific to bacterial culture sample type
"""
submission_info = self._parse_generic("Sample List")
# iloc is [row][column]; the first row becomes the header, so iloc row = spreadsheet row - 2
tech = str(submission_info.iloc[11][1])
@@ -68,7 +99,7 @@ class SheetParser(object):
tech = ", ".join(tech_reg.findall(tech))
self.sub['technician'] = tech
# reagents
# must be prefixed with 'lot_' to be recognized by gui
self.sub['lot_wash_1'] = submission_info.iloc[1][6] #if pd.isnull(submission_info.iloc[1][6]) else string_formatter(submission_info.iloc[1][6])
self.sub['lot_wash_2'] = submission_info.iloc[2][6] #if pd.isnull(submission_info.iloc[2][6]) else string_formatter(submission_info.iloc[2][6])
self.sub['lot_binding_buffer'] = submission_info.iloc[3][6] #if pd.isnull(submission_info.iloc[3][6]) else string_formatter(submission_info.iloc[3][6])
@@ -79,13 +110,17 @@ class SheetParser(object):
self.sub['lot_ethanol'] = submission_info.iloc[10][6] #if pd.isnull(submission_info.iloc[10][6]) else string_formatter(submission_info.iloc[10][6])
self.sub['lot_positive_control'] = submission_info.iloc[103][1] #if pd.isnull(submission_info.iloc[103][1]) else string_formatter(submission_info.iloc[103][1])
self.sub['lot_plate'] = submission_info.iloc[12][6] #if pd.isnull(submission_info.iloc[12][6]) else string_formatter(submission_info.iloc[12][6])
# get individual sample info
sample_parser = SampleParser(submission_info.iloc[15:111])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
logger.debug(f"Parser result: {self.sub}")
self.sub['samples'] = sample_parse()
def _parse_wastewater(self):
def _parse_wastewater(self) -> None:
"""
pulls info specific to wastewater sample type
"""
# submission_info = self.xl.parse("WW Submissions (ENTER HERE)")
submission_info = self._parse_generic("WW Submissions (ENTER HERE)")
enrichment_info = self.xl.parse("Enrichment Worksheet", dtype=object)
@@ -108,19 +143,28 @@ class SheetParser(object):
self.sub['lot_pre_mix_2'] = qprc_info.iloc[2][14] #if pd.isnull(qprc_info.iloc[2][14]) else string_formatter(qprc_info.iloc[2][14])
self.sub['lot_positive_control'] = qprc_info.iloc[3][14] #if pd.isnull(qprc_info.iloc[3][14]) else string_formatter(qprc_info.iloc[3][14])
self.sub['lot_ddh2o'] = qprc_info.iloc[4][14] #if pd.isnull(qprc_info.iloc[4][14]) else string_formatter(qprc_info.iloc[4][14])
# get individual sample info
sample_parser = SampleParser(submission_info.iloc[16:40])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
self.sub['samples'] = sample_parse()
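Both parse paths rely on the same getattr dispatch: the decided submission type selects a parser method by name at runtime, and an unrecognized layout falls through to _parse_unknown. A stripped-down sketch of the idiom (class and method names here are illustrative, not the repo's):

class Dispatcher:
    def handle(self, submission_type: str) -> None:
        # resolve e.g. "_parse_wastewater" from the type string;
        # raises AttributeError if no matching parser method exists
        parse = getattr(self, f"_parse_{submission_type.lower()}")
        parse()

    def _parse_wastewater(self) -> None:
        print("parsing wastewater")

Dispatcher().handle("Wastewater")  # prints: parsing wastewater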
class SampleParser(object):
"""
object to pull data for samples in excel sheet and construct individual sample objects
"""
def __init__(self, df:pd.DataFrame) -> None:
self.samples = df.to_dict("records")
def parse_bacterial_culture_samples(self) -> list[BCSample]:
"""
construct bacterial culture specific sample objects
Returns:
list[BCSample]: list of sample objects
"""
new_list = []
for sample in self.samples:
new = BCSample()
@@ -130,6 +174,7 @@ class SampleParser(object):
new.concentration = sample['Unnamed: 3']
# logger.debug(f"Sample object: {new.sample_id} = {type(new.sample_id)}")
logger.debug(f"Got sample_id: {new.sample_id}")
# need to exclude empties and blanks
try:
not_a_nan = not np.isnan(new.sample_id) and str(new.sample_id).lower() != 'blank'
except TypeError:
@@ -140,10 +185,17 @@ class SampleParser(object):
def parse_wastewater_samples(self) -> list[WWSample]:
"""
construct wastewater specific sample objects
Returns:
list[WWSample]: list of sample objects
"""
new_list = []
for sample in self.samples:
new = WWSample()
new.ww_processing_num = sample['Unnamed: 2']
# need to ensure we have a sample id for database integrity
try:
not_a_nan = not np.isnan(sample['Unnamed: 3'])
except TypeError:
@@ -153,6 +205,7 @@ class SampleParser(object):
else:
new.ww_sample_full_id = uuid.uuid4().hex.upper()
new.rsl_number = sample['Unnamed: 9']
# need to ensure we get a collection date
try:
not_a_nan = not np.isnan(sample['Unnamed: 5'])
except TypeError:
@@ -169,11 +222,11 @@ class SampleParser(object):
return new_list
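The repeated try/except around np.isnan exists because np.isnan raises TypeError for strings, and the uuid fallback guarantees every wastewater sample gets a non-empty id. Condensed into a helper (the helper itself is illustrative, not part of the commit):

import uuid
import numpy as np

def cell_or_generated_id(value) -> str:
    try:
        missing = np.isnan(value)   # blank cells come through as float NaN
    except TypeError:
        missing = False             # strings are never NaN
    return uuid.uuid4().hex.upper() if missing else str(value)

cell_or_generated_id(float("nan"))  # random 32-character hex id
cell_or_generated_id("WW-0001")     # 'WW-0001'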
def string_formatter(input):
logger.debug(f"{input} : {type(input)}")
match input:
case int() | float() | np.float64:
return "{:0.0f}".format(input)
case _:
return input
# def string_formatter(input):
# logger.debug(f"{input} : {type(input)}")
# match input:
# case int() | float() | np.float64:
# return "{:0.0f}".format(input)
# case _:
# return input
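One caveat if the commented-out string_formatter is ever revived: inside match, the bare dotted name np.float64 is a value pattern (an equality test against the class object), not an instance check. Instances still match today only because np.float64 subclasses float; the explicit class pattern needs parentheses, as in this sketch:

import numpy as np

def string_formatter(value):
    match value:
        case int() | float() | np.float64():  # class patterns need ()
            return "{:0.0f}".format(value)
        case _:
            return value

string_formatter(np.float64(12.0))  # '12'
string_formatter("RSL-22-0001")     # 'RSL-22-0001'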

View File

@@ -8,13 +8,22 @@ import logging
logger = logging.getLogger(f"submissions.{__name__}")
def make_report_xlsx(records:list[dict]) -> DataFrame:
"""
create the dataframe for a report
Args:
records (list[dict]): list of dictionaries created from submissions
Returns:
DataFrame: output dataframe
"""
df = DataFrame.from_records(records)
# put submissions with the same lab together
df = df.sort_values("Submitting Lab")
# table = df.pivot_table(values="Cost", index=["Submitting Lab", "Extraction Kit"], columns=["Cost", "Sample Count"], aggfunc={'Cost':np.sum,'Sample Count':np.sum})
# aggregate cost and sample count columns
df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg({'Cost': ['sum', 'count'], 'Sample Count':['sum']})
# df2['Cost'] = df2['Cost'].map('${:,.2f}'.format)
logger.debug(df2.columns)
# df2['Cost']['sum'] = df2['Cost']['sum'].apply('${:,.2f}'.format)
# apply formatting to cost column
df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')] = df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')].applymap('${:,.2f}'.format)
return df2
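The boolean masks over get_level_values work, but with a two-level column index the same cost formatting can also be addressed by column tuple, which may read more clearly. A small self-contained sketch of the equivalent operation, using toy records in place of the real submission summaries:

import pandas as pd

df = pd.DataFrame({
    "Submitting Lab": ["Lab A", "Lab A", "Lab B"],
    "Extraction Kit": ["Kit 1", "Kit 1", "Kit 2"],
    "Cost": [100.0, 250.5, 75.25],
    "Sample Count": [8, 16, 4],
})
df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg(
    {"Cost": ["sum", "count"], "Sample Count": ["sum"]}
)
# ('Cost', 'sum') addresses the summed-cost column directly
df2[("Cost", "sum")] = df2[("Cost", "sum")].map("${:,.2f}".format)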
@@ -65,7 +74,18 @@ def make_report_xlsx(records:list[dict]) -> DataFrame:
# dfs['name'] = df
# return dfs
def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):
def convert_control_by_mode(ctx:dict, control:models.Control, mode:str) -> list[dict]:
"""
split control object into analysis types
Args:
ctx (dict): settings passed from gui
control (models.Control): control to be parsed into list
mode (str): analysis type
Returns:
list[dict]: list of records
"""
output = []
data = json.loads(getattr(control, mode))
for genus in data:
@@ -82,6 +102,17 @@ def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):
def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -> DataFrame:
"""
Convert list of control records to dataframe
Args:
ctx (dict): settings passed from gui
input (list[dict]): list of dictionaries containing records
subtype (str | None, optional): _description_. Defaults to None.
Returns:
DataFrame: dataframe of control records
"""
df = DataFrame.from_records(input)
safe = ['name', 'submitted_date', 'genus', 'target']
logger.debug(df)