Updated parsers and chart constructors.

This commit is contained in:
Landon Wark
2023-04-27 12:51:53 -05:00
parent 8a0a9aa69c
commit dff5a5aa1e
9 changed files with 59 additions and 19 deletions

View File

@@ -1,3 +1,8 @@
## 202304.04
- Kraken controls graph now only pulls top 20 results to prevent crashing.
- Improved cost calculations per column in a 96 well plate.
## 202304.01
- Improved function results output to ui.

View File

@@ -4,7 +4,7 @@ from pathlib import Path
# Version of the realpython-reader package # Version of the realpython-reader package
__project__ = "submissions" __project__ = "submissions"
__version__ = "202304.2b" __version__ = "202304.4b"
__author__ = {"name":"Landon Wark", "email":"Landon.Wark@phac-aspc.gc.ca"} __author__ = {"name":"Landon Wark", "email":"Landon.Wark@phac-aspc.gc.ca"}
__copyright__ = "2022-2023, Government of Canada" __copyright__ = "2022-2023, Government of Canada"

View File

@@ -20,6 +20,7 @@ from getpass import getuser
import numpy as np import numpy as np
import yaml import yaml
from pathlib import Path from pathlib import Path
from math import ceil
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -161,9 +162,13 @@ def construct_submission_info(ctx:dict, info_dict:dict) -> models.BasicSubmissio
# calculate cost of the run: immutable cost + mutable times number of columns # calculate cost of the run: immutable cost + mutable times number of columns
# This is now attached to submission upon creation to preserve at-run costs incase of cost increase in the future. # This is now attached to submission upon creation to preserve at-run costs incase of cost increase in the future.
try: try:
instance.run_cost = instance.extraction_kit.immutable_cost + (instance.extraction_kit.mutable_cost * ((instance.sample_count / 8)/12)) # ceil(instance.sample_count / 8) will get number of columns
except (TypeError, AttributeError): # the cost of a full run multiplied by (that number / 12) is x twelfths the cost of a full run
logger.debug(f"Looks like that kit doesn't have cost breakdown yet, using full plate cost.") logger.debug(f"Instance extraction kit details: {instance.extraction_kit.__dict__}")
cols_count = ceil(int(instance.sample_count) / 8)
instance.run_cost = instance.extraction_kit.constant_cost + (instance.extraction_kit.mutable_cost * (cols_count / 12))
except (TypeError, AttributeError) as e:
logger.debug(f"Looks like that kit doesn't have cost breakdown yet due to: {e}, using full plate cost.")
instance.run_cost = instance.extraction_kit.cost_per_run instance.run_cost = instance.extraction_kit.cost_per_run
# We need to make sure there's a proper rsl plate number # We need to make sure there's a proper rsl plate number
try: try:

View File

@@ -105,6 +105,9 @@ class Control(Base):
for key in data[genus]: for key in data[genus]:
_dict[key] = data[genus][key] _dict[key] = data[genus][key]
output.append(_dict) output.append(_dict)
# Have to triage kraken data to keep program from getting overwhelmed
if "kraken" in mode:
output = sorted(output, key=lambda d: d[f"{mode}_count"], reverse=True)[:49]
return output return output
def create_dummy_data(self, mode:str) -> dict: def create_dummy_data(self, mode:str) -> dict:

View File

@@ -25,8 +25,8 @@ class KitType(Base):
submissions = relationship("BasicSubmission", back_populates="extraction_kit") #: submissions this kit was used for submissions = relationship("BasicSubmission", back_populates="extraction_kit") #: submissions this kit was used for
used_for = Column(JSON) #: list of names of sample types this kit can process used_for = Column(JSON) #: list of names of sample types this kit can process
cost_per_run = Column(FLOAT(2)) #: dollar amount for each full run of this kit NOTE: depreciated, use the constant and mutable costs instead cost_per_run = Column(FLOAT(2)) #: dollar amount for each full run of this kit NOTE: depreciated, use the constant and mutable costs instead
mutable_cost = Column(FLOAT(2)) #: dollar amount that can change with number of columns (reagents, tips, etc) mutable_cost = Column(FLOAT(2)) #: dollar amount per plate that can change with number of columns (reagents, tips, etc)
constant_cost = Column(FLOAT(2)) #: dollar amount that will remain constant (plates, man hours, etc) constant_cost = Column(FLOAT(2)) #: dollar amount per plate that will remain constant (plates, man hours, etc)
reagent_types = relationship("ReagentType", back_populates="kits", uselist=True, secondary=reagenttypes_kittypes) #: reagent types this kit contains reagent_types = relationship("ReagentType", back_populates="kits", uselist=True, secondary=reagenttypes_kittypes) #: reagent types this kit contains
reagent_types_id = Column(INTEGER, ForeignKey("_reagent_types.id", ondelete='SET NULL', use_alter=True, name="fk_KT_reagentstype_id")) #: joined reagent type id reagent_types_id = Column(INTEGER, ForeignKey("_reagent_types.id", ondelete='SET NULL', use_alter=True, name="fk_KT_reagentstype_id")) #: joined reagent type id
@@ -111,3 +111,15 @@ class Reagent(Base):
"lot": self.lot, "lot": self.lot,
"expiry": place_holder.strftime("%Y-%m-%d") "expiry": place_holder.strftime("%Y-%m-%d")
} }
# class Discounts(Base):
# """
# Relationship table for client labs for certain kits.
# """
# __tablename__ = "_discounts"
# id = Column(INTEGER, primary_key=True) #: primary key
# kit = relationship("KitType") #: joined parent reagent type
# kit_id = Column(INTEGER, ForeignKey("_kits.id", ondelete='SET NULL', name="fk_kit_type_id"))
# client = relationship("Organization")

View File

@@ -107,8 +107,8 @@ class SheetParser(object):
""" """
for ii, row in df.iterrows(): for ii, row in df.iterrows():
# skip positive control # skip positive control
if ii == 11: # if ii == 12:
continue # continue
logger.debug(f"Running reagent parse for {row[1]} with type {type(row[1])} and value: {row[2]} with type {type(row[2])}") logger.debug(f"Running reagent parse for {row[1]} with type {type(row[1])} and value: {row[2]} with type {type(row[2])}")
if not isinstance(row[2], float) and check_not_nan(row[1]): if not isinstance(row[2], float) and check_not_nan(row[1]):
# must be prefixed with 'lot_' to be recognized by gui # must be prefixed with 'lot_' to be recognized by gui
@@ -117,7 +117,10 @@ class SheetParser(object):
except AttributeError: except AttributeError:
pass pass
if reagent_type == "//": if reagent_type == "//":
if check_not_nan(row[2]):
reagent_type = row[0].replace(' ', '_').lower().strip() reagent_type = row[0].replace(' ', '_').lower().strip()
else:
continue
try: try:
output_var = row[2].upper() output_var = row[2].upper()
except AttributeError: except AttributeError:
@@ -142,10 +145,11 @@ class SheetParser(object):
# reagents # reagents
# must be prefixed with 'lot_' to be recognized by gui # must be prefixed with 'lot_' to be recognized by gui
# Todo: find a more adaptable way to read reagents. # Todo: find a more adaptable way to read reagents.
reagent_range = submission_info.iloc[1:13, 4:8] reagent_range = submission_info.iloc[1:14, 4:8]
logger.debug(reagent_range)
parse_reagents(reagent_range) parse_reagents(reagent_range)
# get individual sample info # get individual sample info
sample_parser = SampleParser(submission_info.iloc[15:111]) sample_parser = SampleParser(submission_info.iloc[16:112])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples") sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
logger.debug(f"Parser result: {self.sub}") logger.debug(f"Parser result: {self.sub}")
self.sub['samples'] = sample_parse() self.sub['samples'] = sample_parse()

View File

@@ -9,6 +9,7 @@ import sys
from pathlib import Path from pathlib import Path
import re import re
from tools import check_if_app from tools import check_if_app
import asyncio
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -109,9 +110,10 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
if column not in safe: if column not in safe:
if subtype != None and column != subtype: if subtype != None and column != subtype:
del df[column] del df[column]
# logger.debug(df)
# move date of sample submitted on same date as previous ahead one. # move date of sample submitted on same date as previous ahead one.
df = displace_date(df) df = displace_date(df)
df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl") # df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl")
# ad hoc method to make data labels more accurate. # ad hoc method to make data labels more accurate.
df = df_column_renamer(df=df) df = df_column_renamer(df=df)
return df return df
@@ -131,8 +133,8 @@ def df_column_renamer(df:DataFrame) -> DataFrame:
return df.rename(columns = { return df.rename(columns = {
"contains_ratio":"contains_shared_hashes_ratio", "contains_ratio":"contains_shared_hashes_ratio",
"matches_ratio":"matches_shared_hashes_ratio", "matches_ratio":"matches_shared_hashes_ratio",
"kraken_count":"kraken2_read_count", "kraken_count":"kraken2_read_count_(top_20)",
"kraken_percent":"kraken2_read_percent" "kraken_percent":"kraken2_read_percent_(top_20)"
}) })

View File

@@ -35,6 +35,7 @@ from .custom_widgets.pop_ups import AlertPop, QuestionAsker
from .custom_widgets import ReportDatePicker, ReagentTypeForm from .custom_widgets import ReportDatePicker, ReagentTypeForm
from .custom_widgets.misc import ImportReagent from .custom_widgets.misc import ImportReagent
from .visualizations.control_charts import create_charts, construct_html from .visualizations.control_charts import create_charts, construct_html
import asyncio
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -111,11 +112,14 @@ def import_submission_function(obj:QMainWindow) -> Tuple[QMainWindow, dict|None]
add_widget = QComboBox() add_widget = QComboBox()
# lookup existing kits by 'submission_type' decided on by sheetparser # lookup existing kits by 'submission_type' decided on by sheetparser
uses = [item.__str__() for item in lookup_kittype_by_use(ctx=obj.ctx, used_by=prsr.sub['submission_type'])] uses = [item.__str__() for item in lookup_kittype_by_use(ctx=obj.ctx, used_by=prsr.sub['submission_type'])]
add_widget.addItems(uses)
if check_not_nan(prsr.sub[item]): if check_not_nan(prsr.sub[item]):
logger.debug(f"The extraction kit in parser was: {prsr.sub[item]}")
uses.insert(0, uses.pop(uses.index(prsr.sub[item])))
obj.ext_kit = prsr.sub[item] obj.ext_kit = prsr.sub[item]
else: else:
obj.ext_kit = add_widget.currentText() logger.error(f"Couldn't find prsr.sub[extraction_kit]")
obj.ext_kit = uses[0]
add_widget.addItems(uses)
case 'submitted_date': case 'submitted_date':
# create label # create label
obj.table_widget.formlayout.addWidget(QLabel(item.replace("_", " ").title())) obj.table_widget.formlayout.addWidget(QLabel(item.replace("_", " ").title()))
@@ -265,7 +269,7 @@ def submit_new_sample_function(obj:QMainWindow) -> QMainWindow:
# reset form # reset form
for item in obj.table_widget.formlayout.parentWidget().findChildren(QWidget): for item in obj.table_widget.formlayout.parentWidget().findChildren(QWidget):
item.setParent(None) item.setParent(None)
print(dir(obj)) # print(dir(obj))
if hasattr(obj, 'csv'): if hasattr(obj, 'csv'):
dlg = QuestionAsker("Export CSV?", "Would you like to export the csv file?") dlg = QuestionAsker("Export CSV?", "Would you like to export the csv file?")
if dlg.exec(): if dlg.exec():
@@ -426,6 +430,8 @@ def chart_maker_function(obj:QMainWindow) -> QMainWindow:
# flatten data to one dimensional list # flatten data to one dimensional list
data = [item for sublist in data for item in sublist] data = [item for sublist in data for item in sublist]
logger.debug(f"Control objects going into df conversion: {data}") logger.debug(f"Control objects going into df conversion: {data}")
if data == []:
return obj, dict(status="Critical", message="No data found for controls in given date range.")
# send to dataframe creator # send to dataframe creator
df = convert_data_list_to_df(ctx=obj.ctx, input=data, subtype=obj.subtype) df = convert_data_list_to_df(ctx=obj.ctx, input=data, subtype=obj.subtype)
if obj.subtype == None: if obj.subtype == None:

View File

@@ -39,7 +39,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure:
genera.append("") genera.append("")
df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"}) df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"})
df['genera'] = genera df['genera'] = genera
df = df.dropna() # df = df.dropna()
# remove original runs, using reruns if applicable # remove original runs, using reruns if applicable
df = drop_reruns_from_df(ctx=ctx, df=df) df = drop_reruns_from_df(ctx=ctx, df=df)
# sort by and exclude from # sort by and exclude from
@@ -49,6 +49,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure:
# Set descending for any columns that have "{mode}" in the header. # Set descending for any columns that have "{mode}" in the header.
ascending = [False if item == "target" else True for item in sorts] ascending = [False if item == "target" else True for item in sorts]
df = df.sort_values(by=sorts, ascending=ascending) df = df.sort_values(by=sorts, ascending=ascending)
logger.debug(df[df.isna().any(axis=1)])
# actual chart construction is done by # actual chart construction is done by
fig = construct_chart(ctx=ctx, df=df, modes=modes, ytitle=ytitle) fig = construct_chart(ctx=ctx, df=df, modes=modes, ytitle=ytitle)
return fig return fig
@@ -245,6 +246,8 @@ def construct_kraken_chart(settings:dict, df:pd.DataFrame, group_name:str, mode:
Figure: initial figure with traces for modes Figure: initial figure with traces for modes
""" """
df[f'{mode}_count'] = pd.to_numeric(df[f'{mode}_count'],errors='coerce') df[f'{mode}_count'] = pd.to_numeric(df[f'{mode}_count'],errors='coerce')
df = df.groupby('submitted_date')[f'{mode}_count'].nlargest(2)
# The actual percentage from kraken was off due to exclusion of NaN, recalculating. # The actual percentage from kraken was off due to exclusion of NaN, recalculating.
df[f'{mode}_percent'] = 100 * df[f'{mode}_count'] / df.groupby('submitted_date')[f'{mode}_count'].transform('sum') df[f'{mode}_percent'] = 100 * df[f'{mode}_count'] / df.groupby('submitted_date')[f'{mode}_count'].transform('sum')
modes = settings['modes'][mode] modes = settings['modes'][mode]