From dff5a5aa1efc7edcc12d909b96fdbda9cf20a4a9 Mon Sep 17 00:00:00 2001 From: Landon Wark Date: Thu, 27 Apr 2023 12:51:53 -0500 Subject: [PATCH] Updated parsers and chart constructors. --- CHANGELOG.md | 5 +++++ src/submissions/__init__.py | 2 +- src/submissions/backend/db/functions.py | 11 ++++++++--- src/submissions/backend/db/models/controls.py | 3 +++ src/submissions/backend/db/models/kits.py | 18 +++++++++++++++--- src/submissions/backend/excel/parser.py | 14 +++++++++----- src/submissions/backend/excel/reports.py | 8 +++++--- .../frontend/main_window_functions.py | 12 +++++++++--- .../frontend/visualizations/control_charts.py | 5 ++++- 9 files changed, 59 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2ca958..a547838 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 202304.04 + +- Kraken controls graph now only pulls top 20 results to prevent crashing. +- Improved cost calculations per column in a 96 well plate. + ## 202304.01 - Improved function results output to ui. diff --git a/src/submissions/__init__.py b/src/submissions/__init__.py index 301a5df..4354579 100644 --- a/src/submissions/__init__.py +++ b/src/submissions/__init__.py @@ -4,7 +4,7 @@ from pathlib import Path # Version of the realpython-reader package __project__ = "submissions" -__version__ = "202304.2b" +__version__ = "202304.4b" __author__ = {"name":"Landon Wark", "email":"Landon.Wark@phac-aspc.gc.ca"} __copyright__ = "2022-2023, Government of Canada" diff --git a/src/submissions/backend/db/functions.py b/src/submissions/backend/db/functions.py index 54fc09e..ac8b8c2 100644 --- a/src/submissions/backend/db/functions.py +++ b/src/submissions/backend/db/functions.py @@ -20,6 +20,7 @@ from getpass import getuser import numpy as np import yaml from pathlib import Path +from math import ceil logger = logging.getLogger(f"submissions.{__name__}") @@ -161,9 +162,13 @@ def construct_submission_info(ctx:dict, info_dict:dict) -> models.BasicSubmissio # calculate cost of the run: immutable cost + mutable times number of columns # This is now attached to submission upon creation to preserve at-run costs incase of cost increase in the future. try: - instance.run_cost = instance.extraction_kit.immutable_cost + (instance.extraction_kit.mutable_cost * ((instance.sample_count / 8)/12)) - except (TypeError, AttributeError): - logger.debug(f"Looks like that kit doesn't have cost breakdown yet, using full plate cost.") + # ceil(instance.sample_count / 8) will get number of columns + # the cost of a full run multiplied by (that number / 12) is x twelfths the cost of a full run + logger.debug(f"Instance extraction kit details: {instance.extraction_kit.__dict__}") + cols_count = ceil(int(instance.sample_count) / 8) + instance.run_cost = instance.extraction_kit.constant_cost + (instance.extraction_kit.mutable_cost * (cols_count / 12)) + except (TypeError, AttributeError) as e: + logger.debug(f"Looks like that kit doesn't have cost breakdown yet due to: {e}, using full plate cost.") instance.run_cost = instance.extraction_kit.cost_per_run # We need to make sure there's a proper rsl plate number try: diff --git a/src/submissions/backend/db/models/controls.py b/src/submissions/backend/db/models/controls.py index 3dfd7aa..b483523 100644 --- a/src/submissions/backend/db/models/controls.py +++ b/src/submissions/backend/db/models/controls.py @@ -105,6 +105,9 @@ class Control(Base): for key in data[genus]: _dict[key] = data[genus][key] output.append(_dict) + # Have to triage kraken data to keep program from getting overwhelmed + if "kraken" in mode: + output = sorted(output, key=lambda d: d[f"{mode}_count"], reverse=True)[:49] return output def create_dummy_data(self, mode:str) -> dict: diff --git a/src/submissions/backend/db/models/kits.py b/src/submissions/backend/db/models/kits.py index 8e51087..56bd1d6 100644 --- a/src/submissions/backend/db/models/kits.py +++ b/src/submissions/backend/db/models/kits.py @@ -25,8 +25,8 @@ class KitType(Base): submissions = relationship("BasicSubmission", back_populates="extraction_kit") #: submissions this kit was used for used_for = Column(JSON) #: list of names of sample types this kit can process cost_per_run = Column(FLOAT(2)) #: dollar amount for each full run of this kit NOTE: depreciated, use the constant and mutable costs instead - mutable_cost = Column(FLOAT(2)) #: dollar amount that can change with number of columns (reagents, tips, etc) - constant_cost = Column(FLOAT(2)) #: dollar amount that will remain constant (plates, man hours, etc) + mutable_cost = Column(FLOAT(2)) #: dollar amount per plate that can change with number of columns (reagents, tips, etc) + constant_cost = Column(FLOAT(2)) #: dollar amount per plate that will remain constant (plates, man hours, etc) reagent_types = relationship("ReagentType", back_populates="kits", uselist=True, secondary=reagenttypes_kittypes) #: reagent types this kit contains reagent_types_id = Column(INTEGER, ForeignKey("_reagent_types.id", ondelete='SET NULL', use_alter=True, name="fk_KT_reagentstype_id")) #: joined reagent type id @@ -110,4 +110,16 @@ class Reagent(Base): "type": type, "lot": self.lot, "expiry": place_holder.strftime("%Y-%m-%d") - } \ No newline at end of file + } + + +# class Discounts(Base): +# """ +# Relationship table for client labs for certain kits. +# """ +# __tablename__ = "_discounts" + +# id = Column(INTEGER, primary_key=True) #: primary key +# kit = relationship("KitType") #: joined parent reagent type +# kit_id = Column(INTEGER, ForeignKey("_kits.id", ondelete='SET NULL', name="fk_kit_type_id")) +# client = relationship("Organization") \ No newline at end of file diff --git a/src/submissions/backend/excel/parser.py b/src/submissions/backend/excel/parser.py index 0efae25..07c925d 100644 --- a/src/submissions/backend/excel/parser.py +++ b/src/submissions/backend/excel/parser.py @@ -107,8 +107,8 @@ class SheetParser(object): """ for ii, row in df.iterrows(): # skip positive control - if ii == 11: - continue + # if ii == 12: + # continue logger.debug(f"Running reagent parse for {row[1]} with type {type(row[1])} and value: {row[2]} with type {type(row[2])}") if not isinstance(row[2], float) and check_not_nan(row[1]): # must be prefixed with 'lot_' to be recognized by gui @@ -117,7 +117,10 @@ class SheetParser(object): except AttributeError: pass if reagent_type == "//": - reagent_type = row[0].replace(' ', '_').lower().strip() + if check_not_nan(row[2]): + reagent_type = row[0].replace(' ', '_').lower().strip() + else: + continue try: output_var = row[2].upper() except AttributeError: @@ -142,10 +145,11 @@ class SheetParser(object): # reagents # must be prefixed with 'lot_' to be recognized by gui # Todo: find a more adaptable way to read reagents. - reagent_range = submission_info.iloc[1:13, 4:8] + reagent_range = submission_info.iloc[1:14, 4:8] + logger.debug(reagent_range) parse_reagents(reagent_range) # get individual sample info - sample_parser = SampleParser(submission_info.iloc[15:111]) + sample_parser = SampleParser(submission_info.iloc[16:112]) sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples") logger.debug(f"Parser result: {self.sub}") self.sub['samples'] = sample_parse() diff --git a/src/submissions/backend/excel/reports.py b/src/submissions/backend/excel/reports.py index 4b3c52b..ea9dab0 100644 --- a/src/submissions/backend/excel/reports.py +++ b/src/submissions/backend/excel/reports.py @@ -9,6 +9,7 @@ import sys from pathlib import Path import re from tools import check_if_app +import asyncio logger = logging.getLogger(f"submissions.{__name__}") @@ -109,9 +110,10 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) - if column not in safe: if subtype != None and column != subtype: del df[column] + # logger.debug(df) # move date of sample submitted on same date as previous ahead one. df = displace_date(df) - df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl") + # df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl") # ad hoc method to make data labels more accurate. df = df_column_renamer(df=df) return df @@ -131,8 +133,8 @@ def df_column_renamer(df:DataFrame) -> DataFrame: return df.rename(columns = { "contains_ratio":"contains_shared_hashes_ratio", "matches_ratio":"matches_shared_hashes_ratio", - "kraken_count":"kraken2_read_count", - "kraken_percent":"kraken2_read_percent" + "kraken_count":"kraken2_read_count_(top_20)", + "kraken_percent":"kraken2_read_percent_(top_20)" }) diff --git a/src/submissions/frontend/main_window_functions.py b/src/submissions/frontend/main_window_functions.py index 27ea09a..4fc2095 100644 --- a/src/submissions/frontend/main_window_functions.py +++ b/src/submissions/frontend/main_window_functions.py @@ -35,6 +35,7 @@ from .custom_widgets.pop_ups import AlertPop, QuestionAsker from .custom_widgets import ReportDatePicker, ReagentTypeForm from .custom_widgets.misc import ImportReagent from .visualizations.control_charts import create_charts, construct_html +import asyncio logger = logging.getLogger(f"submissions.{__name__}") @@ -111,11 +112,14 @@ def import_submission_function(obj:QMainWindow) -> Tuple[QMainWindow, dict|None] add_widget = QComboBox() # lookup existing kits by 'submission_type' decided on by sheetparser uses = [item.__str__() for item in lookup_kittype_by_use(ctx=obj.ctx, used_by=prsr.sub['submission_type'])] - add_widget.addItems(uses) if check_not_nan(prsr.sub[item]): + logger.debug(f"The extraction kit in parser was: {prsr.sub[item]}") + uses.insert(0, uses.pop(uses.index(prsr.sub[item]))) obj.ext_kit = prsr.sub[item] else: - obj.ext_kit = add_widget.currentText() + logger.error(f"Couldn't find prsr.sub[extraction_kit]") + obj.ext_kit = uses[0] + add_widget.addItems(uses) case 'submitted_date': # create label obj.table_widget.formlayout.addWidget(QLabel(item.replace("_", " ").title())) @@ -265,7 +269,7 @@ def submit_new_sample_function(obj:QMainWindow) -> QMainWindow: # reset form for item in obj.table_widget.formlayout.parentWidget().findChildren(QWidget): item.setParent(None) - print(dir(obj)) + # print(dir(obj)) if hasattr(obj, 'csv'): dlg = QuestionAsker("Export CSV?", "Would you like to export the csv file?") if dlg.exec(): @@ -426,6 +430,8 @@ def chart_maker_function(obj:QMainWindow) -> QMainWindow: # flatten data to one dimensional list data = [item for sublist in data for item in sublist] logger.debug(f"Control objects going into df conversion: {data}") + if data == []: + return obj, dict(status="Critical", message="No data found for controls in given date range.") # send to dataframe creator df = convert_data_list_to_df(ctx=obj.ctx, input=data, subtype=obj.subtype) if obj.subtype == None: diff --git a/src/submissions/frontend/visualizations/control_charts.py b/src/submissions/frontend/visualizations/control_charts.py index a70f1ce..a1db0ee 100644 --- a/src/submissions/frontend/visualizations/control_charts.py +++ b/src/submissions/frontend/visualizations/control_charts.py @@ -39,7 +39,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure: genera.append("") df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"}) df['genera'] = genera - df = df.dropna() + # df = df.dropna() # remove original runs, using reruns if applicable df = drop_reruns_from_df(ctx=ctx, df=df) # sort by and exclude from @@ -49,6 +49,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure: # Set descending for any columns that have "{mode}" in the header. ascending = [False if item == "target" else True for item in sorts] df = df.sort_values(by=sorts, ascending=ascending) + logger.debug(df[df.isna().any(axis=1)]) # actual chart construction is done by fig = construct_chart(ctx=ctx, df=df, modes=modes, ytitle=ytitle) return fig @@ -245,6 +246,8 @@ def construct_kraken_chart(settings:dict, df:pd.DataFrame, group_name:str, mode: Figure: initial figure with traces for modes """ df[f'{mode}_count'] = pd.to_numeric(df[f'{mode}_count'],errors='coerce') + df = df.groupby('submitted_date')[f'{mode}_count'].nlargest(2) + # The actual percentage from kraken was off due to exclusion of NaN, recalculating. df[f'{mode}_percent'] = 100 * df[f'{mode}_count'] / df.groupby('submitted_date')[f'{mode}_count'].transform('sum') modes = settings['modes'][mode]