documentation and converted to username-based exclusion of adding new kits
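The hunks below mostly add docstrings, type hints, and inline comments; the username-based exclusion for adding new kits named in the title is not visible in them. As a rough illustration only, such a gate might look like the following sketch (getpass.getuser() and the 'power_users' settings key are assumptions, not code from this commit):

import getpass
import logging

logger = logging.getLogger(f"submissions.{__name__}")

def can_add_kit(ctx: dict) -> bool:
    # hypothetical check: compare the current login name against an allow-list kept in the gui settings dict
    user = getpass.getuser()
    if user not in ctx.get("power_users", []):
        logger.warning(f"{user} is not permitted to add new kits.")
        return False
    return True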
@@ -1,4 +1,3 @@
 from pandas import DataFrame
 import re

@@ -6,14 +5,14 @@ import re

 def get_unique_values_in_df_column(df: DataFrame, column_name: str) -> list:
     """
-    _summary_
+    get all unique values in a dataframe column by name

     Args:
-        df (DataFrame): _description_
-        column_name (str): _description_
+        df (DataFrame): input dataframe
+        column_name (str): name of column of interest

     Returns:
-        list: _description_
+        list: sorted list of unique values
     """
     return sorted(df[column_name].unique())

@@ -23,7 +22,7 @@ def drop_reruns_from_df(ctx:dict, df: DataFrame) -> DataFrame:
     Removes semi-duplicates from dataframe after finding sequencing repeats.

     Args:
-        settings (dict): settings passed down from click
+        settings (dict): settings passed from gui
         df (DataFrame): initial dataframe

     Returns:
@@ -11,40 +11,68 @@ import uuid
 logger = logging.getLogger(f"submissions.{__name__}")

 class SheetParser(object):

-    def __init__(self, filepath:Path|None = None, **kwargs):
     """
     object to pull and contain data from excel file
     """
+    def __init__(self, filepath:Path|None = None, **kwargs) -> None:
+        """
+        Args:
+            filepath (Path | None, optional): file path to excel sheet. Defaults to None.
+        """
         logger.debug(f"Parsing {filepath.__str__()}")
+        # set attributes based on kwargs from gui ctx
         for kwarg in kwargs:
             setattr(self, f"_{kwarg}", kwargs[kwarg])
         if filepath == None:
-            logger.debug(f"No filepath.")
+            logger.error(f"No filepath given.")
             self.xl = None
         else:
             try:
                 self.xl = pd.ExcelFile(filepath.__str__())
-            except ValueError:
+            except ValueError as e:
+                logger.error(f"Incorrect value: {e}")
                 self.xl = None
         self.sub = OrderedDict()
-        self.sub['submission_type'] = self._type_decider()
+        # make decision about type of sample we have
+        self.sub['submission_type'] = self._type_decider()
+        # select proper parser based on sample type
         parse_sub = getattr(self, f"_parse_{self.sub['submission_type'].lower()}")
         parse_sub()

-    def _type_decider(self):
+    def _type_decider(self) -> str:
+        """
+        makes decisions about submission type based on structure of excel file
+
+        Returns:
+            str: submission type name
+        """
         try:
             for type in self._submission_types:
                 if self.xl.sheet_names == self._submission_types[type]['excel_map']:
                     return type.title()
             return "Unknown"
-        except:
+        except Exception as e:
+            logger.warning(f"We were unable to parse the submission type due to: {e}")
             return "Unknown"


-    def _parse_unknown(self):
+    def _parse_unknown(self) -> None:
+        """
+        Dummy function to handle unknown excel structures
+        """
         self.sub = None


-    def _parse_generic(self, sheet_name:str):
+    def _parse_generic(self, sheet_name:str) -> pd.DataFrame:
+        """
+        Pulls information common to all submission types and passes on dataframe
+
+        Args:
+            sheet_name (str): name of excel worksheet to pull from
+
+        Returns:
+            pd.DataFrame: relevant dataframe from excel sheet
+        """
         submission_info = self.xl.parse(sheet_name=sheet_name, dtype=object)

         self.sub['submitter_plate_num'] = submission_info.iloc[0][1] #if pd.isnull(submission_info.iloc[0][1]) else string_formatter(submission_info.iloc[0][1])
@@ -57,7 +85,10 @@ class SheetParser(object):
         return submission_info


-    def _parse_bacterial_culture(self):
+    def _parse_bacterial_culture(self) -> None:
+        """
+        pulls info specific to bacterial culture sample type
+        """
         submission_info = self._parse_generic("Sample List")
         # iloc is [row][column] and the first row is set as header row so -2
         tech = str(submission_info.iloc[11][1])
@@ -68,7 +99,7 @@ class SheetParser(object):
         tech = ", ".join(tech_reg.findall(tech))
         self.sub['technician'] = tech
-        # reagents

+        # must be prefixed with 'lot_' to be recognized by gui
         self.sub['lot_wash_1'] = submission_info.iloc[1][6] #if pd.isnull(submission_info.iloc[1][6]) else string_formatter(submission_info.iloc[1][6])
         self.sub['lot_wash_2'] = submission_info.iloc[2][6] #if pd.isnull(submission_info.iloc[2][6]) else string_formatter(submission_info.iloc[2][6])
         self.sub['lot_binding_buffer'] = submission_info.iloc[3][6] #if pd.isnull(submission_info.iloc[3][6]) else string_formatter(submission_info.iloc[3][6])
@@ -79,13 +110,17 @@ class SheetParser(object):
         self.sub['lot_ethanol'] = submission_info.iloc[10][6] #if pd.isnull(submission_info.iloc[10][6]) else string_formatter(submission_info.iloc[10][6])
         self.sub['lot_positive_control'] = submission_info.iloc[103][1] #if pd.isnull(submission_info.iloc[103][1]) else string_formatter(submission_info.iloc[103][1])
         self.sub['lot_plate'] = submission_info.iloc[12][6] #if pd.isnull(submission_info.iloc[12][6]) else string_formatter(submission_info.iloc[12][6])
         # get individual sample info
         sample_parser = SampleParser(submission_info.iloc[15:111])
         sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
         logger.debug(f"Parser result: {self.sub}")
         self.sub['samples'] = sample_parse()


-    def _parse_wastewater(self):
+    def _parse_wastewater(self) -> None:
+        """
+        pulls info specific to wastewater sample type
+        """
+        # submission_info = self.xl.parse("WW Submissions (ENTER HERE)")
+        submission_info = self._parse_generic("WW Submissions (ENTER HERE)")
         enrichment_info = self.xl.parse("Enrichment Worksheet", dtype=object)
@@ -108,19 +143,28 @@ class SheetParser(object):
         self.sub['lot_pre_mix_2'] = qprc_info.iloc[2][14] #if pd.isnull(qprc_info.iloc[2][14]) else string_formatter(qprc_info.iloc[2][14])
         self.sub['lot_positive_control'] = qprc_info.iloc[3][14] #if pd.isnull(qprc_info.iloc[3][14]) else string_formatter(qprc_info.iloc[3][14])
         self.sub['lot_ddh2o'] = qprc_info.iloc[4][14] #if pd.isnull(qprc_info.iloc[4][14]) else string_formatter(qprc_info.iloc[4][14])
         # get individual sample info
         sample_parser = SampleParser(submission_info.iloc[16:40])
         sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
         self.sub['samples'] = sample_parse()


 class SampleParser(object):

+    """
+    object to pull data for samples in excel sheet and construct individual sample objects
+    """

     def __init__(self, df:pd.DataFrame) -> None:
         self.samples = df.to_dict("records")


     def parse_bacterial_culture_samples(self) -> list[BCSample]:
+        """
+        construct bacterial culture specific sample objects
+
+        Returns:
+            list[BCSample]: list of sample objects
+        """
         new_list = []
         for sample in self.samples:
             new = BCSample()
@@ -130,6 +174,7 @@ class SampleParser(object):
             new.concentration = sample['Unnamed: 3']
             # logger.debug(f"Sample object: {new.sample_id} = {type(new.sample_id)}")
             logger.debug(f"Got sample_id: {new.sample_id}")
+            # need to exclude empties and blanks
             try:
                 not_a_nan = not np.isnan(new.sample_id) and str(new.sample_id).lower() != 'blank'
             except TypeError:
@@ -140,10 +185,17 @@ class SampleParser(object):


     def parse_wastewater_samples(self) -> list[WWSample]:
+        """
+        construct wastewater specific sample objects
+
+        Returns:
+            list[WWSample]: list of sample objects
+        """
         new_list = []
         for sample in self.samples:
             new = WWSample()
             new.ww_processing_num = sample['Unnamed: 2']
+            # need to ensure we have a sample id for database integrity
             try:
                 not_a_nan = not np.isnan(sample['Unnamed: 3'])
             except TypeError:
@@ -153,6 +205,7 @@ class SampleParser(object):
             else:
                 new.ww_sample_full_id = uuid.uuid4().hex.upper()
             new.rsl_number = sample['Unnamed: 9']
+            # need to ensure we get a collection date
             try:
                 not_a_nan = not np.isnan(sample['Unnamed: 5'])
             except TypeError:
@@ -169,11 +222,11 @@ class SampleParser(object):
         return new_list


-def string_formatter(input):
-    logger.debug(f"{input} : {type(input)}")
-    match input:
-        case int() | float() | np.float64:
-            return "{:0.0f}".format(input)
-        case _:
-            return input
+# def string_formatter(input):
+#     logger.debug(f"{input} : {type(input)}")
+#     match input:
+#         case int() | float() | np.float64:
+#             return "{:0.0f}".format(input)
+#         case _:
+#             return input

@@ -8,13 +8,22 @@ import logging
 logger = logging.getLogger(f"submissions.{__name__}")

 def make_report_xlsx(records:list[dict]) -> DataFrame:
+    """
+    create the dataframe for a report
+
+    Args:
+        records (list[dict]): list of dictionaries created from submissions
+
+    Returns:
+        DataFrame: output dataframe
+    """
     df = DataFrame.from_records(records)
     # put submissions with the same lab together
     df = df.sort_values("Submitting Lab")
     # table = df.pivot_table(values="Cost", index=["Submitting Lab", "Extraction Kit"], columns=["Cost", "Sample Count"], aggfunc={'Cost':np.sum,'Sample Count':np.sum})
     # aggregate cost and sample count columns
     df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg({'Cost': ['sum', 'count'], 'Sample Count':['sum']})
     # df2['Cost'] = df2['Cost'].map('${:,.2f}'.format)
     logger.debug(df2.columns)
     # df2['Cost']['sum'] = df2['Cost']['sum'].apply('${:,.2f}'.format)
     # apply formatting to cost column
     df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')] = df2.iloc[:, (df2.columns.get_level_values(1)=='sum') & (df2.columns.get_level_values(0)=='Cost')].applymap('${:,.2f}'.format)
     return df2
@@ -65,7 +74,18 @@ def make_report_xlsx(records:list[dict]) -> DataFrame:
     # dfs['name'] = df
     # return dfs

-def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):
+def convert_control_by_mode(ctx:dict, control:models.Control, mode:str) -> list[dict]:
+    """
+    split control object into analysis types
+
+    Args:
+        ctx (dict): settings passed from gui
+        control (models.Control): control to be parsed into list
+        mode (str): analysis type
+
+    Returns:
+        list[dict]: list of records
+    """
     output = []
     data = json.loads(getattr(control, mode))
     for genus in data:
@@ -82,6 +102,17 @@ def convert_control_by_mode(ctx:dict, control:models.Control, mode:str):


 def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -> DataFrame:
+    """
+    Convert list of control records to dataframe
+
+    Args:
+        ctx (dict): settings passed from gui
+        input (list[dict]): list of dictionaries containing records
+        subtype (str | None, optional): _description_. Defaults to None.
+
+    Returns:
+        DataFrame: _description_
+    """
     df = DataFrame.from_records(input)
     safe = ['name', 'submitted_date', 'genus', 'target']
     logger.debug(df)
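For reference, the cost-aggregation pattern used by make_report_xlsx above can be exercised on its own roughly as follows. The records are invented for illustration; only the column names ("Submitting Lab", "Extraction Kit", "Cost", "Sample Count") come from the diff:

from pandas import DataFrame

# toy records shaped like the submission summaries the report expects (values invented)
records = [
    {"Submitting Lab": "Lab A", "Extraction Kit": "Kit 1", "Cost": 50.0, "Sample Count": 8},
    {"Submitting Lab": "Lab A", "Extraction Kit": "Kit 1", "Cost": 75.0, "Sample Count": 12},
    {"Submitting Lab": "Lab B", "Extraction Kit": "Kit 2", "Cost": 30.0, "Sample Count": 4},
]

df = DataFrame.from_records(records).sort_values("Submitting Lab")
# aggregate cost (sum plus submission count) and sample count per lab/kit pair
df2 = df.groupby(["Submitting Lab", "Extraction Kit"]).agg({"Cost": ["sum", "count"], "Sample Count": ["sum"]})
# format only the ('Cost', 'sum') column as currency, mirroring the column-level mask in the diff;
# applymap matches the diff, while newer pandas also offers the equivalent DataFrame.map
cost_sum = (df2.columns.get_level_values(0) == "Cost") & (df2.columns.get_level_values(1) == "sum")
df2.iloc[:, cost_sum] = df2.iloc[:, cost_sum].applymap("${:,.2f}".format)
print(df2)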