Updated parsers and chart constructors.

This commit is contained in:
Landon Wark
2023-04-27 12:51:53 -05:00
parent 8a0a9aa69c
commit dff5a5aa1e
9 changed files with 59 additions and 19 deletions

View File

@@ -1,3 +1,8 @@
## 202304.04
- Kraken controls graph now only pulls top 20 results to prevent crashing.
- Improved cost calculations per column in a 96 well plate.
## 202304.01
- Improved function results output to ui.

View File

@@ -4,7 +4,7 @@ from pathlib import Path
# Version of the realpython-reader package # Version of the realpython-reader package
__project__ = "submissions" __project__ = "submissions"
__version__ = "202304.2b" __version__ = "202304.4b"
__author__ = {"name":"Landon Wark", "email":"Landon.Wark@phac-aspc.gc.ca"} __author__ = {"name":"Landon Wark", "email":"Landon.Wark@phac-aspc.gc.ca"}
__copyright__ = "2022-2023, Government of Canada" __copyright__ = "2022-2023, Government of Canada"

View File

@@ -20,6 +20,7 @@ from getpass import getuser
import numpy as np import numpy as np
import yaml import yaml
from pathlib import Path from pathlib import Path
from math import ceil
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -161,9 +162,13 @@ def construct_submission_info(ctx:dict, info_dict:dict) -> models.BasicSubmissio
# calculate cost of the run: immutable cost + mutable times number of columns # calculate cost of the run: immutable cost + mutable times number of columns
# This is now attached to submission upon creation to preserve at-run costs incase of cost increase in the future. # This is now attached to submission upon creation to preserve at-run costs incase of cost increase in the future.
try: try:
instance.run_cost = instance.extraction_kit.immutable_cost + (instance.extraction_kit.mutable_cost * ((instance.sample_count / 8)/12)) # ceil(instance.sample_count / 8) will get number of columns
except (TypeError, AttributeError): # the cost of a full run multiplied by (that number / 12) is x twelfths the cost of a full run
logger.debug(f"Looks like that kit doesn't have cost breakdown yet, using full plate cost.") logger.debug(f"Instance extraction kit details: {instance.extraction_kit.__dict__}")
cols_count = ceil(int(instance.sample_count) / 8)
instance.run_cost = instance.extraction_kit.constant_cost + (instance.extraction_kit.mutable_cost * (cols_count / 12))
except (TypeError, AttributeError) as e:
logger.debug(f"Looks like that kit doesn't have cost breakdown yet due to: {e}, using full plate cost.")
instance.run_cost = instance.extraction_kit.cost_per_run instance.run_cost = instance.extraction_kit.cost_per_run
# We need to make sure there's a proper rsl plate number # We need to make sure there's a proper rsl plate number
try: try:

View File

@@ -105,6 +105,9 @@ class Control(Base):
for key in data[genus]: for key in data[genus]:
_dict[key] = data[genus][key] _dict[key] = data[genus][key]
output.append(_dict) output.append(_dict)
# Have to triage kraken data to keep program from getting overwhelmed
if "kraken" in mode:
output = sorted(output, key=lambda d: d[f"{mode}_count"], reverse=True)[:49]
return output return output
def create_dummy_data(self, mode:str) -> dict: def create_dummy_data(self, mode:str) -> dict:

View File

@@ -25,8 +25,8 @@ class KitType(Base):
submissions = relationship("BasicSubmission", back_populates="extraction_kit") #: submissions this kit was used for submissions = relationship("BasicSubmission", back_populates="extraction_kit") #: submissions this kit was used for
used_for = Column(JSON) #: list of names of sample types this kit can process used_for = Column(JSON) #: list of names of sample types this kit can process
cost_per_run = Column(FLOAT(2)) #: dollar amount for each full run of this kit NOTE: depreciated, use the constant and mutable costs instead cost_per_run = Column(FLOAT(2)) #: dollar amount for each full run of this kit NOTE: depreciated, use the constant and mutable costs instead
mutable_cost = Column(FLOAT(2)) #: dollar amount that can change with number of columns (reagents, tips, etc) mutable_cost = Column(FLOAT(2)) #: dollar amount per plate that can change with number of columns (reagents, tips, etc)
constant_cost = Column(FLOAT(2)) #: dollar amount that will remain constant (plates, man hours, etc) constant_cost = Column(FLOAT(2)) #: dollar amount per plate that will remain constant (plates, man hours, etc)
reagent_types = relationship("ReagentType", back_populates="kits", uselist=True, secondary=reagenttypes_kittypes) #: reagent types this kit contains reagent_types = relationship("ReagentType", back_populates="kits", uselist=True, secondary=reagenttypes_kittypes) #: reagent types this kit contains
reagent_types_id = Column(INTEGER, ForeignKey("_reagent_types.id", ondelete='SET NULL', use_alter=True, name="fk_KT_reagentstype_id")) #: joined reagent type id reagent_types_id = Column(INTEGER, ForeignKey("_reagent_types.id", ondelete='SET NULL', use_alter=True, name="fk_KT_reagentstype_id")) #: joined reagent type id
@@ -111,3 +111,15 @@ class Reagent(Base):
"lot": self.lot, "lot": self.lot,
"expiry": place_holder.strftime("%Y-%m-%d") "expiry": place_holder.strftime("%Y-%m-%d")
} }
# class Discounts(Base):
# """
# Relationship table for client labs for certain kits.
# """
# __tablename__ = "_discounts"
# id = Column(INTEGER, primary_key=True) #: primary key
# kit = relationship("KitType") #: joined parent reagent type
# kit_id = Column(INTEGER, ForeignKey("_kits.id", ondelete='SET NULL', name="fk_kit_type_id"))
# client = relationship("Organization")

View File

@@ -107,8 +107,8 @@ class SheetParser(object):
""" """
for ii, row in df.iterrows(): for ii, row in df.iterrows():
# skip positive control # skip positive control
if ii == 11: # if ii == 12:
continue # continue
logger.debug(f"Running reagent parse for {row[1]} with type {type(row[1])} and value: {row[2]} with type {type(row[2])}") logger.debug(f"Running reagent parse for {row[1]} with type {type(row[1])} and value: {row[2]} with type {type(row[2])}")
if not isinstance(row[2], float) and check_not_nan(row[1]): if not isinstance(row[2], float) and check_not_nan(row[1]):
# must be prefixed with 'lot_' to be recognized by gui # must be prefixed with 'lot_' to be recognized by gui
@@ -117,7 +117,10 @@ class SheetParser(object):
except AttributeError: except AttributeError:
pass pass
if reagent_type == "//": if reagent_type == "//":
if check_not_nan(row[2]):
reagent_type = row[0].replace(' ', '_').lower().strip() reagent_type = row[0].replace(' ', '_').lower().strip()
else:
continue
try: try:
output_var = row[2].upper() output_var = row[2].upper()
except AttributeError: except AttributeError:
@@ -142,10 +145,11 @@ class SheetParser(object):
# reagents # reagents
# must be prefixed with 'lot_' to be recognized by gui # must be prefixed with 'lot_' to be recognized by gui
# Todo: find a more adaptable way to read reagents. # Todo: find a more adaptable way to read reagents.
reagent_range = submission_info.iloc[1:13, 4:8] reagent_range = submission_info.iloc[1:14, 4:8]
logger.debug(reagent_range)
parse_reagents(reagent_range) parse_reagents(reagent_range)
# get individual sample info # get individual sample info
sample_parser = SampleParser(submission_info.iloc[15:111]) sample_parser = SampleParser(submission_info.iloc[16:112])
sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples") sample_parse = getattr(sample_parser, f"parse_{self.sub['submission_type'].lower()}_samples")
logger.debug(f"Parser result: {self.sub}") logger.debug(f"Parser result: {self.sub}")
self.sub['samples'] = sample_parse() self.sub['samples'] = sample_parse()

View File

@@ -9,6 +9,7 @@ import sys
from pathlib import Path from pathlib import Path
import re import re
from tools import check_if_app from tools import check_if_app
import asyncio
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -109,9 +110,10 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
if column not in safe: if column not in safe:
if subtype != None and column != subtype: if subtype != None and column != subtype:
del df[column] del df[column]
# logger.debug(df)
# move date of sample submitted on same date as previous ahead one. # move date of sample submitted on same date as previous ahead one.
df = displace_date(df) df = displace_date(df)
df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl") # df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl")
# ad hoc method to make data labels more accurate. # ad hoc method to make data labels more accurate.
df = df_column_renamer(df=df) df = df_column_renamer(df=df)
return df return df
@@ -131,8 +133,8 @@ def df_column_renamer(df:DataFrame) -> DataFrame:
return df.rename(columns = { return df.rename(columns = {
"contains_ratio":"contains_shared_hashes_ratio", "contains_ratio":"contains_shared_hashes_ratio",
"matches_ratio":"matches_shared_hashes_ratio", "matches_ratio":"matches_shared_hashes_ratio",
"kraken_count":"kraken2_read_count", "kraken_count":"kraken2_read_count_(top_20)",
"kraken_percent":"kraken2_read_percent" "kraken_percent":"kraken2_read_percent_(top_20)"
}) })

View File

@@ -35,6 +35,7 @@ from .custom_widgets.pop_ups import AlertPop, QuestionAsker
from .custom_widgets import ReportDatePicker, ReagentTypeForm from .custom_widgets import ReportDatePicker, ReagentTypeForm
from .custom_widgets.misc import ImportReagent from .custom_widgets.misc import ImportReagent
from .visualizations.control_charts import create_charts, construct_html from .visualizations.control_charts import create_charts, construct_html
import asyncio
logger = logging.getLogger(f"submissions.{__name__}") logger = logging.getLogger(f"submissions.{__name__}")
@@ -111,11 +112,14 @@ def import_submission_function(obj:QMainWindow) -> Tuple[QMainWindow, dict|None]
add_widget = QComboBox() add_widget = QComboBox()
# lookup existing kits by 'submission_type' decided on by sheetparser # lookup existing kits by 'submission_type' decided on by sheetparser
uses = [item.__str__() for item in lookup_kittype_by_use(ctx=obj.ctx, used_by=prsr.sub['submission_type'])] uses = [item.__str__() for item in lookup_kittype_by_use(ctx=obj.ctx, used_by=prsr.sub['submission_type'])]
add_widget.addItems(uses)
if check_not_nan(prsr.sub[item]): if check_not_nan(prsr.sub[item]):
logger.debug(f"The extraction kit in parser was: {prsr.sub[item]}")
uses.insert(0, uses.pop(uses.index(prsr.sub[item])))
obj.ext_kit = prsr.sub[item] obj.ext_kit = prsr.sub[item]
else: else:
obj.ext_kit = add_widget.currentText() logger.error(f"Couldn't find prsr.sub[extraction_kit]")
obj.ext_kit = uses[0]
add_widget.addItems(uses)
case 'submitted_date': case 'submitted_date':
# create label # create label
obj.table_widget.formlayout.addWidget(QLabel(item.replace("_", " ").title())) obj.table_widget.formlayout.addWidget(QLabel(item.replace("_", " ").title()))
@@ -265,7 +269,7 @@ def submit_new_sample_function(obj:QMainWindow) -> QMainWindow:
# reset form # reset form
for item in obj.table_widget.formlayout.parentWidget().findChildren(QWidget): for item in obj.table_widget.formlayout.parentWidget().findChildren(QWidget):
item.setParent(None) item.setParent(None)
print(dir(obj)) # print(dir(obj))
if hasattr(obj, 'csv'): if hasattr(obj, 'csv'):
dlg = QuestionAsker("Export CSV?", "Would you like to export the csv file?") dlg = QuestionAsker("Export CSV?", "Would you like to export the csv file?")
if dlg.exec(): if dlg.exec():
@@ -426,6 +430,8 @@ def chart_maker_function(obj:QMainWindow) -> QMainWindow:
# flatten data to one dimensional list # flatten data to one dimensional list
data = [item for sublist in data for item in sublist] data = [item for sublist in data for item in sublist]
logger.debug(f"Control objects going into df conversion: {data}") logger.debug(f"Control objects going into df conversion: {data}")
if data == []:
return obj, dict(status="Critical", message="No data found for controls in given date range.")
# send to dataframe creator # send to dataframe creator
df = convert_data_list_to_df(ctx=obj.ctx, input=data, subtype=obj.subtype) df = convert_data_list_to_df(ctx=obj.ctx, input=data, subtype=obj.subtype)
if obj.subtype == None: if obj.subtype == None:

View File

@@ -39,7 +39,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure:
genera.append("") genera.append("")
df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"}) df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"})
df['genera'] = genera df['genera'] = genera
df = df.dropna() # df = df.dropna()
# remove original runs, using reruns if applicable # remove original runs, using reruns if applicable
df = drop_reruns_from_df(ctx=ctx, df=df) df = drop_reruns_from_df(ctx=ctx, df=df)
# sort by and exclude from # sort by and exclude from
@@ -49,6 +49,7 @@ def create_charts(ctx:dict, df:pd.DataFrame, ytitle:str|None=None) -> Figure:
# Set descending for any columns that have "{mode}" in the header. # Set descending for any columns that have "{mode}" in the header.
ascending = [False if item == "target" else True for item in sorts] ascending = [False if item == "target" else True for item in sorts]
df = df.sort_values(by=sorts, ascending=ascending) df = df.sort_values(by=sorts, ascending=ascending)
logger.debug(df[df.isna().any(axis=1)])
# actual chart construction is done by # actual chart construction is done by
fig = construct_chart(ctx=ctx, df=df, modes=modes, ytitle=ytitle) fig = construct_chart(ctx=ctx, df=df, modes=modes, ytitle=ytitle)
return fig return fig
@@ -245,6 +246,8 @@ def construct_kraken_chart(settings:dict, df:pd.DataFrame, group_name:str, mode:
Figure: initial figure with traces for modes Figure: initial figure with traces for modes
""" """
df[f'{mode}_count'] = pd.to_numeric(df[f'{mode}_count'],errors='coerce') df[f'{mode}_count'] = pd.to_numeric(df[f'{mode}_count'],errors='coerce')
df = df.groupby('submitted_date')[f'{mode}_count'].nlargest(2)
# The actual percentage from kraken was off due to exclusion of NaN, recalculating. # The actual percentage from kraken was off due to exclusion of NaN, recalculating.
df[f'{mode}_percent'] = 100 * df[f'{mode}_count'] / df.groupby('submitted_date')[f'{mode}_count'].transform('sum') df[f'{mode}_percent'] = 100 * df[f'{mode}_count'] / df.groupby('submitted_date')[f'{mode}_count'].transform('sum')
modes = settings['modes'][mode] modes = settings['modes'][mode]