Checking kit integrity on import.

This commit is contained in:
Landon Wark
2023-03-03 15:06:43 -06:00
parent 1c89c31d25
commit 82d5378479
14 changed files with 376 additions and 95 deletions

View File

@@ -576,17 +576,23 @@ def get_all_controls_by_type(ctx:dict, con_type:str, start_date:date|None=None,
list: Control instances.
"""
# logger.debug(f"Using dates: {start_date} to {end_date}")
query = ctx['database_session'].query(models.ControlType).filter_by(name=con_type)
try:
output = query.first().instances
except AttributeError:
output = None
# Hacky solution to my not being able to get the sql query to work.
logger.debug(f"Using dates: {start_date} to {end_date}")
if start_date != None and end_date != None:
output = [item for item in output if item.submitted_date.date() > start_date and item.submitted_date.date() < end_date]
# logger.debug(f"Type {con_type}: {query.first()}")
output = ctx['database_session'].query(models.Control).join(models.ControlType).filter_by(name=con_type).filter(models.Control.submitted_date.between(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))).all()
else:
output = ctx['database_session'].query(models.Control).join(models.ControlType).filter_by(name=con_type).all()
logger.debug(f"Returned controls between dates: {output}")
return output
# query = ctx['database_session'].query(models.ControlType).filter_by(name=con_type)
# try:
# output = query.first().instances
# except AttributeError:
# output = None
# # Hacky solution to my not being able to get the sql query to work.
# if start_date != None and end_date != None:
# output = [item for item in output if item.submitted_date.date() > start_date and item.submitted_date.date() < end_date]
# # logger.debug(f"Type {con_type}: {query.first()}")
# return output
def get_control_subtypes(ctx:dict, type:str, mode:str) -> list[str]:

View File

@@ -39,9 +39,18 @@ class Control(Base):
# UniqueConstraint('name', name='uq_control_name')
submission_id = Column(INTEGER, ForeignKey("_submissions.id")) #: parent submission id
submission = relationship("BacterialCulture", back_populates="controls", foreign_keys=[submission_id]) #: parent submission
refseq_version = Column(String(16))
kraken2_version = Column(String(16))
kraken2_db_version = Column(String(32))
def to_sub_dict(self):
def to_sub_dict(self) -> dict:
"""
Converts object into convenient dictionary for use in submission summary
Returns:
dict: output dictionary containing: Name, Type, Targets, Top Kraken results
"""
kraken = json.loads(self.kraken)
kraken_cnt_total = sum([kraken[item]['kraken_count'] for item in kraken])
new_kraken = []
@@ -61,3 +70,46 @@ class Control(Base):
}
return output
def convert_by_mode(self, mode:str) -> list[dict]:
    """
    Split this control's stored results for one analysis type into flat records.

    Args:
        mode (str): analysis type; names the attribute on this control that holds the JSON string to parse

    Returns:
        list[dict]: one record per genus, holding name, submitted_date, genus, target and every key stored for that genus
    """
    data = json.loads(getattr(self, mode))
    logger.debug(f"Length of data: {len(data)}")
    output = []
    for genus, values in data.items():
        record = {
            'name': self.name,
            'submitted_date': self.submitted_date,
            'genus': genus,
            # Asterisks at either end of the genus are ignored when matching against the control type's targets.
            'target': 'Target' if genus.strip("*") in self.controltype.targets else "Off-target",
        }
        for key, value in values.items():
            record[key] = value
            if value == {}:
                # Reported through the module logger rather than print() so it follows the logging configuration.
                logger.debug(f"Empty value for '{key}' in control {self.name}, mode {mode}")
        output.append(record)
    return output
def create_dummy_data(self, mode:str) -> dict:
    """
    Build placeholder results for an analysis type.

    Args:
        mode (str): analysis type ("contains", "matches" or "kraken")

    Returns:
        dict: a single "Nothing" genus with zeroed values for the given mode, or an empty dict for any other mode
    """
    # Built on every call so each caller receives its own fresh, mutable dictionaries.
    dummies = {
        "contains": {"Nothing": {"contains_hashes":"0/400", "contains_ratio":0.0}},
        "matches": {"Nothing": {"matches_hashes":"0/400", "matches_ratio":0.0}},
        "kraken": {"Nothing": {"kraken_percent":0.0, "kraken_count":0}},
    }
    return dummies.get(mode, {})

View File

@@ -4,6 +4,7 @@ from sqlalchemy.orm import relationship
from datetime import datetime as dt
import logging
import json
from json.decoder import JSONDecodeError
logger = logging.getLogger(f"submissions.{__name__}")
@@ -74,6 +75,9 @@ class BasicSubmission(Base):
ext_info = json.loads(self.extraction_info)
except TypeError:
ext_info = None
except JSONDecodeError as e:
ext_info = None
logger.debug(f"Json error in {self.rsl_plate_num}: {e}")
try:
reagents = [item.to_sub_dict() for item in self.reagents]
except:

View File

@@ -1,10 +1,11 @@
from pandas import DataFrame, concat
from backend.db import models
from operator import itemgetter
# from backend.db import models
import json
import logging
from jinja2 import Environment, FileSystemLoader
from datetime import date
from datetime import date, timedelta
import sys
from pathlib import Path
@@ -139,31 +140,32 @@ def make_report_html(df:DataFrame, start_date:date, end_date:date) -> str:
# dfs['name'] = df
# return dfs
def convert_control_by_mode(ctx:dict, control:models.Control, mode:str) -> list[dict]:
"""
split control object into analysis types
# def convert_control_by_mode(ctx:dict, control:models.Control, mode:str) -> list[dict]:
# """
# split control object into analysis types... can I move this into the class itself?
# turns out I can
Args:
ctx (dict): settings passed from gui
control (models.Control): control to be parsed into list
mode (str): analysis type
# Args:
# ctx (dict): settings passed from gui
# control (models.Control): control to be parsed into list
# mode (str): analysis type
Returns:
list[dict]: list of records
"""
output = []
data = json.loads(getattr(control, mode))
for genus in data:
_dict = {}
_dict['name'] = control.name
_dict['submitted_date'] = control.submitted_date
_dict['genus'] = genus
_dict['target'] = 'Target' if genus.strip("*") in control.controltype.targets else "Off-target"
for key in data[genus]:
_dict[key] = data[genus][key]
output.append(_dict)
# logger.debug(output)
return output
# Returns:
# list[dict]: list of records
# """
# output = []
# data = json.loads(getattr(control, mode))
# for genus in data:
# _dict = {}
# _dict['name'] = control.name
# _dict['submitted_date'] = control.submitted_date
# _dict['genus'] = genus
# _dict['target'] = 'Target' if genus.strip("*") in control.controltype.targets else "Off-target"
# for key in data[genus]:
# _dict[key] = data[genus][key]
# output.append(_dict)
# # logger.debug(output)
# return output
def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -> DataFrame:
@@ -178,17 +180,81 @@ def convert_data_list_to_df(ctx:dict, input:list[dict], subtype:str|None=None) -
Returns:
DataFrame: control records as a dataframe, with same-day dates displaced and columns renamed
"""
# copy = input
# for item in copy:
# item['submitted_date'] = item['submitted_date'].strftime("%Y-%m-%d")
# with open("controls.json", "w") as f:
# f.write(json.dumps(copy))
# for item in input:
# logger.debug(item.keys())
df = DataFrame.from_records(input)
df.to_excel("test.xlsx", engine="openpyxl")
safe = ['name', 'submitted_date', 'genus', 'target']
# logger.debug(df)
for column in df.columns:
if "percent" in column:
count_col = [item for item in df.columns if "count" in item][0]
# The actual percentage from kraken was off due to exclusion of NaN, recalculating.
df[column] = 100 * df[count_col] / df.groupby('submitted_date')[count_col].transform('sum')
# df[column] = 100 * df[count_col] / df.groupby('submitted_date')[count_col].transform('sum')
df[column] = 100 * df[count_col] / df.groupby('name')[count_col].transform('sum')
if column not in safe:
if subtype != None and column != subtype:
del df[column]
# logger.debug(df)
# df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl")
df = displace_date(df)
df.sort_values('submitted_date').to_excel("controls.xlsx", engine="openpyxl")
df = df_column_renamer(df=df)
return df
def df_column_renamer(df:DataFrame) -> DataFrame:
    """
    Drop the raw hash columns and give the remaining result columns clearer names.

    Args:
        df (DataFrame): input dataframe

    Returns:
        DataFrame: dataframe without any column matching '_hashes' and with 'clarified' column names
    """
    clarified_names = {
        "contains_ratio":"contains_shared_hashes_ratio",
        "matches_ratio":"matches_shared_hashes_ratio",
        "kraken_count":"kraken2_read_count",
        "kraken_percent":"kraken2_read_percent"
    }
    # Every column whose label matches the regex '_hashes' is removed before renaming.
    hash_columns = list(df.filter(regex='_hashes'))
    kept_columns = df.columns.drop(hash_columns)
    trimmed = df[kept_columns]
    return trimmed.rename(columns=clarified_names)
def displace_date(df:DataFrame) -> DataFrame:
    """
    This function serves to split samples that were submitted on the same date by incrementing dates.

    Control names are sorted and each one is compared with the name directly before it;
    when both share the same submitted_date, every row of the later name is moved forward one day.
    The input dataframe is modified in place and also returned.

    Args:
        df (DataFrame): input dataframe composed of control records; needs 'name' and 'submitted_date' columns

    Returns:
        DataFrame: output dataframe with dates incremented.
    """
    logger.debug(f"Unique items: {df['name'].unique()}")
    # the assumption is that closest names will have closest dates...
    dict_list = [dict(name=item, date=df[df['name'] == item].iloc[0]['submitted_date']) for item in sorted(df['name'].unique())]
    # Pair each entry with its predecessor. The first entry has no predecessor and is never shifted;
    # indexing with [ii-1] at ii == 0 would silently wrap around to the last entry instead.
    for previous, item in zip(dict_list, dict_list[1:]):
        if item['date'] == previous['date']:
            logger.debug(f"We found one! Increment date!\n{item['date'] + timedelta(days=1)}")
            mask = df['name'] == item['name']
            df.loc[mask, 'submitted_date'] = df.loc[mask, 'submitted_date'].apply(lambda x: x + timedelta(days=1))
    # NOTE(review): comparisons use the dates captured before any shifting, so a run of three or more
    # names on the same date leaves the later ones sharing the same shifted date - confirm if intended.
    return df