Before control chart update.

This commit is contained in:
lwark
2024-07-31 07:50:23 -05:00
parent 2a07265cbc
commit eb6cdc63e2
10 changed files with 305 additions and 396 deletions

View File

@@ -1,19 +1,29 @@
'''
"""
Functions for constructing controls graphs using plotly.
'''
TODO: Move these functions to widgets.controls_charts
"""
import re
import plotly
import plotly.express as px
import pandas as pd
from pandas import DataFrame
from plotly.graph_objects import Figure
import logging
# from backend.excel import get_unique_values_in_df_column
from tools import Settings, get_unique_values_in_df_column
from tools import Settings, get_unique_values_in_df_column, divide_chunks
from frontend.widgets.functions import select_save_file
logger = logging.getLogger(f"submissions.{__name__}")
def create_charts(ctx:Settings, df:pd.DataFrame, ytitle:str|None=None) -> Figure:
class CustomFigure(Figure):
    """
    Plotly Figure subclass; at present a stub whose initializer only runs the base Figure constructor.

    Args:
        ctx (Settings): accepted by the initializer but not used yet.
        df (pd.DataFrame): accepted by the initializer but not used yet.
        ytitle (str | None, optional): accepted by the initializer but not used yet. Defaults to None.
    """
    def __init__(self, ctx: Settings, df: pd.DataFrame, ytitle: str | None = None):
        # NOTE: none of the arguments are consumed here; only the parent constructor is invoked.
        super().__init__()
        # NOTE: Start here.
def create_charts(ctx: Settings, df: pd.DataFrame, ytitle: str | None = None) -> Figure:
"""
Constructs figures based on parsed pandas dataframe.
@@ -24,8 +34,8 @@ def create_charts(ctx:Settings, df:pd.DataFrame, ytitle:str|None=None) -> Figure
Returns:
Figure: Plotly figure
"""
from backend.excel import drop_reruns_from_df
"""
# from backend.excel import drop_reruns_from_df
# converts starred genera to normal and splits off list of starred
genera = []
if df.empty:
@@ -33,28 +43,50 @@ def create_charts(ctx:Settings, df:pd.DataFrame, ytitle:str|None=None) -> Figure
for item in df['genus'].to_list():
try:
if item[-1] == "*":
genera.append(item[-1])
genera.append(item[-1])
else:
genera.append("")
except IndexError:
genera.append("")
df['genus'] = df['genus'].replace({'\*':''}, regex=True).replace({"NaN":"Unknown"})
df['genus'] = df['genus'].replace({'\*': ''}, regex=True).replace({"NaN": "Unknown"})
df['genera'] = genera
# remove original runs, using reruns if applicable
# NOTE: remove original runs, using reruns if applicable
df = drop_reruns_from_df(ctx=ctx, df=df)
# sort by and exclude from
# NOTE: sort by and exclude from
sorts = ['submitted_date', "target", "genus"]
exclude = ['name', 'genera']
modes = [item for item in df.columns if item not in sorts and item not in exclude]# and "_hashes" not in item]
# Set descending for any columns that have "{mode}" in the header.
modes = [item for item in df.columns if item not in sorts and item not in exclude] # and "_hashes" not in item]
# NOTE: Set descending for any columns that have "{mode}" in the header.
ascending = [False if item == "target" else True for item in sorts]
df = df.sort_values(by=sorts, ascending=ascending)
# logger.debug(df[df.isna().any(axis=1)])
# actual chart construction is done by
# NOTE: actual chart construction is done by
fig = construct_chart(df=df, modes=modes, ytitle=ytitle)
return fig
def generic_figure_markers(fig:Figure, modes:list=[], ytitle:str|None=None) -> Figure:
def drop_reruns_from_df(ctx: Settings, df: DataFrame) -> DataFrame:
    """
    Removes semi-duplicates from dataframe after finding sequencing repeats.

    A sample name is treated as a repeat when ctx.rerun_regex matches it; the same
    name with the matched text stripped is taken to be the first run, and every row
    carrying that first-run name is dropped.

    Args:
        ctx (Settings): settings passed from gui; 'rerun_regex' is read from it when present
        df (DataFrame): initial dataframe (its 'name' column is inspected)

    Returns:
        DataFrame: dataframe with originals removed in favour of repeats.
    """
    # NOTE(review): this relies on Settings supporting the `in` operator - confirm against tools.Settings.
    if 'rerun_regex' not in ctx:
        return df
    unique_names = get_unique_values_in_df_column(df, column_name="name")
    pattern = re.compile(fr"{ctx.rerun_regex}")
    for candidate in unique_names:
        if not pattern.search(candidate):
            continue
        # NOTE: stripping the rerun marker gives the name of the run being superseded.
        original_name = re.sub(pattern, "", candidate)
        superseded_rows = df[df.name == original_name].index
        df = df.drop(superseded_rows)
    return df
def generic_figure_markers(fig: Figure, modes: list = [], ytitle: str | None = None) -> Figure:
"""
Adds standard layout to figure.
@@ -101,7 +133,8 @@ def generic_figure_markers(fig:Figure, modes:list=[], ytitle:str|None=None) -> F
assert type(fig) == Figure
return fig
def make_buttons(modes:list, fig_len:int) -> list:
def make_buttons(modes: list, fig_len: int) -> list:
"""
Creates list of buttons with one for each mode to be used in showing/hiding mode traces.
@@ -127,13 +160,14 @@ def make_buttons(modes:list, fig_len:int) -> list:
mode_vis = [item for sublist in mode_vis for item in sublist]
# Now, make button to add to list
buttons.append(dict(label=mode, method="update", args=[
{"visible": mode_vis},
{"yaxis.title.text": mode},
]
))
{"visible": mode_vis},
{"yaxis.title.text": mode},
]
))
return buttons
def output_figures(figs:list, group_name:str):
def output_figures(figs: list, group_name: str):
"""
Writes plotly figure to html file.
@@ -150,7 +184,8 @@ def output_figures(figs:list, group_name:str):
except AttributeError:
logger.error(f"The following figure was a string: {fig}")
def construct_chart(df:pd.DataFrame, modes:list, ytitle:str|None=None) -> Figure:
def construct_chart(df: pd.DataFrame, modes: list, ytitle: str | None = None) -> Figure:
"""
Creates a plotly chart for controls from a pandas dataframe
@@ -161,53 +196,40 @@ def construct_chart(df:pd.DataFrame, modes:list, ytitle:str|None=None) -> Figure
Returns:
Figure: output stacked bar chart.
"""
"""
fig = Figure()
for ii, mode in enumerate(modes):
if "count" in mode:
df[mode] = pd.to_numeric(df[mode],errors='coerce')
df[mode] = pd.to_numeric(df[mode], errors='coerce')
color = "genus"
color_discrete_sequence=None
color_discrete_sequence = None
elif 'percent' in mode:
color = "genus"
color_discrete_sequence=None
color_discrete_sequence = None
else:
color = "target"
match get_unique_values_in_df_column(df, 'target'):
case ['Target']:
color_discrete_sequence=["blue"]
color_discrete_sequence = ["blue"]
case ['Off-target']:
color_discrete_sequence=['red']
color_discrete_sequence = ['red']
case _:
color_discrete_sequence=['blue', 'red']
bar = px.bar(df, x="submitted_date",
y=mode,
color=color,
title=mode,
barmode='stack',
hover_data=["genus", "name", "target", mode],
text="genera",
color_discrete_sequence=color_discrete_sequence
)
bar.update_traces(visible = ii == 0)
color_discrete_sequence = ['blue', 'red']
bar = px.bar(df, x="submitted_date",
y=mode,
color=color,
title=mode,
barmode='stack',
hover_data=["genus", "name", "target", mode],
text="genera",
color_discrete_sequence=color_discrete_sequence
)
bar.update_traces(visible=ii == 0)
fig.add_traces(bar.data)
return generic_figure_markers(fig=fig, modes=modes, ytitle=ytitle)
def divide_chunks(input_list: list, chunk_count: int):
    """
    Divides a list into {chunk_count} consecutive parts of near-equal length.

    When the list length is not a multiple of chunk_count, the leading chunks each
    receive one extra element.

    Args:
        input_list (list): Initial list
        chunk_count (int): number of chunks to produce

    Returns:
        generator: lazily yields the sublists in order.
    """
    base_size, leftover = divmod(len(input_list), chunk_count)

    def _bounds(index: int) -> tuple:
        # NOTE: the first `leftover` chunks are one element longer than the rest.
        start = index * base_size + min(index, leftover)
        stop = (index + 1) * base_size + min(index + 1, leftover)
        return start, stop

    return (input_list[slice(*_bounds(index))] for index in range(chunk_count))
def construct_html(figure:Figure) -> str:
def construct_html(figure: Figure) -> str:
"""
Creates final html code from plotly
@@ -216,10 +238,11 @@ def construct_html(figure:Figure) -> str:
Returns:
str: html string
"""
"""
html = '<html><body>'
if figure is not None:
html += plotly.offline.plot(figure, output_type='div', include_plotlyjs='cdn')#, image = 'png', auto_open=True, image_filename='plot_image')
html += plotly.offline.plot(figure, output_type='div',
include_plotlyjs='cdn') #, image = 'png', auto_open=True, image_filename='plot_image')
else:
html += "<h1>No data was retrieved for the given parameters.</h1>"
html += '</body></html>'

View File

@@ -1,6 +1,9 @@
'''
"""
Handles display of control charts
'''
"""
from datetime import timedelta
from typing import Tuple
from PyQt6.QtWebEngineWidgets import QWebEngineView
from PyQt6.QtWidgets import (
QWidget, QVBoxLayout, QComboBox, QHBoxLayout,
@@ -10,8 +13,9 @@ from PyQt6.QtCore import QSignalBlocker
from backend.db import ControlType, Control
from PyQt6.QtCore import QDate, QSize
import logging
from pandas import DataFrame
from tools import Report, Result
from backend.excel.reports import convert_data_list_to_df
# from backend.excel.reports import convert_data_list_to_df
from frontend.visualizations.control_charts import create_charts, construct_html
logger = logging.getLogger(f"submissions.{__name__}")
@@ -28,17 +32,17 @@ class ControlsViewer(QWidget):
# set tab2 layout
self.layout = QVBoxLayout(self)
self.control_typer = QComboBox()
# fetch types of controls
# NOTE: fetch types of controls
con_types = [item.name for item in ControlType.query()]
self.control_typer.addItems(con_types)
# create custom widget to get types of analysis
# NOTE: create custom widget to get types of analysis
self.mode_typer = QComboBox()
mode_types = Control.get_modes()
self.mode_typer.addItems(mode_types)
# create custom widget to get subtypes of analysis
# NOTE: create custom widget to get subtypes of analysis
self.sub_typer = QComboBox()
self.sub_typer.setEnabled(False)
# add widgets to tab2 layout
# NOTE: add widgets to tab2 layout
self.layout.addWidget(self.datepicker)
self.layout.addWidget(self.control_typer)
self.layout.addWidget(self.mode_typer)
@@ -118,8 +122,8 @@ class ControlsViewer(QWidget):
Tuple[QMainWindow, dict]: Collection of new main app window and result dict
"""
report = Report()
# logger.debug(f"Control getter context: \n\tControl type: {self.con_type}\n\tMode: {self.mode}\n\tStart Date: {self.start_date}\n\tEnd Date: {self.end_date}")
# NOTE: set the subtype for kraken
# logger.debug(f"Control getter context: \n\tControl type: {self.con_type}\n\tMode: {self.mode}\n\tStart Date: {self.start_date}\n\tEnd Date: {self.end_date}")
# NOTE: set the subtype for kraken
if self.sub_typer.currentText() == "":
self.subtype = None
else:
@@ -140,7 +144,7 @@ class ControlsViewer(QWidget):
self.report.add_result(Result(status="Critical", msg="No data found for controls in given date range."))
return
# NOTE send to dataframe creator
df = convert_data_list_to_df(input=data, subtype=self.subtype)
df = self.convert_data_list_to_df(input_df=data)
if self.subtype is None:
title = self.mode
else:
@@ -156,6 +160,116 @@ class ControlsViewer(QWidget):
# logger.debug("Figure updated... I hope.")
self.report.add_result(report)
def convert_data_list_to_df(self, input_df: list[dict]) -> DataFrame:
    """
    Convert list of control records to dataframe

    Percent columns are recalculated from the first count column, then (when
    self.subtype is set) every column that is neither a core column nor the
    selected subtype is removed. Dates are displaced and columns renamed last.

    Args:
        input_df (list[dict]): list of dictionaries containing records

    Returns:
        DataFrame: dataframe of controls
    """
    df = DataFrame.from_records(input_df)
    safe = ['name', 'submitted_date', 'genus', 'target']
    # NOTE: locate count/percent columns up front, before anything is removed, so the
    # recalculation below never loses the count column it depends on.
    count_cols = [item for item in df.columns if "count" in item]
    percent_cols = [item for item in df.columns if "percent" in item]
    if count_cols and percent_cols:
        count_col = count_cols[0]
        # NOTE: The actual percentage from kraken was off due to exclusion of NaN, recalculating.
        totals = df.groupby('name')[count_col].transform('sum')
        for column in percent_cols:
            df[column] = 100 * df[count_col] / totals
    if self.subtype is not None:
        # NOTE: keep only the core columns plus the selected subtype.
        unwanted = [column for column in df.columns if column not in safe and column != self.subtype]
        df = df.drop(columns=unwanted)
    # NOTE: move date of sample submitted on same date as previous ahead one.
    df = self.displace_date(df=df)
    # NOTE: ad hoc method to make data labels more accurate.
    df = self.df_column_renamer(df=df)
    return df
def df_column_renamer(self, df: DataFrame) -> DataFrame:
    """
    Ad hoc helper that drops hash columns and gives some fields clearer names.

    Args:
        df (DataFrame): input dataframe

    Returns:
        DataFrame: dataframe without '_hashes' columns and with 'clarified' column names
    """
    # NOTE: any column whose name matches '_hashes' is left out of the result.
    hash_columns = list(df.filter(regex='_hashes'))
    kept_columns = df.columns.drop(hash_columns)
    trimmed = df[kept_columns]
    clearer_names = {
        "contains_ratio": "contains_shared_hashes_ratio",
        "matches_ratio": "matches_shared_hashes_ratio",
        "kraken_count": "kraken2_read_count_(top_50)",
        "kraken_percent": "kraken2_read_percent_(top_50)",
    }
    return trimmed.rename(columns=clearer_names)
def displace_date(self, df: DataFrame) -> DataFrame:
    """
    Splits apart controls that share a submitted date by handing each one to check_date.

    Controls are visited in sorted name order; each is paired with the submitted date
    of its first row, and check_date decides whether that date has to be moved.

    Args:
        df (DataFrame): input dataframe composed of control records

    Returns:
        DataFrame: output dataframe as returned by the successive check_date calls.
    """
    # NOTE: collect every control's name and first submitted date before any date is touched
    controls = []
    for control_name in sorted(df['name'].unique()):
        first_row = df[df.name == control_name].iloc[0]
        controls.append(dict(name=control_name, date=first_row['submitted_date']))
    seen_dates = []
    for control in controls:
        df, seen_dates = self.check_date(df=df, item=control, previous_dates=seen_dates)
    return df
def check_date(self, df: DataFrame, item: dict, previous_dates: list) -> Tuple[DataFrame, list]:
    """
    Checks if an items date is already present in previous_dates and adjusts df accordingly

    While the item's date collides with one already seen, every row of df whose name
    matches the item is moved forward one day, as is item['date']. Each date that is
    examined is appended to previous_dates. Both df and previous_dates are modified in place.

    Args:
        df (DataFrame): input dataframe
        item (dict): control for checking (reads 'name' and 'date')
        previous_dates (list): list of dates found in previous controls

    Returns:
        Tuple[DataFrame, list]: Output dataframe and appended list of previous dates
    """
    one_day = timedelta(days=1)
    while True:
        collided = item['date'] in previous_dates
        # NOTE: the examined date is recorded whether or not it collided.
        previous_dates.append(item['date'])
        if not collided:
            # NOTE: the date is unique among previous controls, nothing (more) to change.
            return df, previous_dates
        # NOTE: get df locations where name == item name
        mask = df['name'] == item['name']
        # NOTE: increment date in dataframe
        df.loc[mask, 'submitted_date'] = df.loc[mask, 'submitted_date'].apply(lambda x: x + one_day)
        item['date'] += one_day
        # NOTE: the date was changed, so loop again to check the new date
        logger.warning("Date check failed, running recursion")
class ControlsDatePicker(QWidget):
"""
custom widget to pick start and end dates for controls graphs