Changes from all commits (21 commits)
78a6d04 fix issue with parsing empty sheets. (kraj593, May 8, 2020)
d8772f0 removed unnecessary dependencies (kraj593, May 8, 2020)
b9fd123 Update readme.md (kraj593, May 12, 2020)
7b93231 Update readme.md (kraj593, May 12, 2020)
4669a8a update attribute names for cobrapy objects to account for deprecated … (kraj593, Feb 15, 2021)
3802607 Merge remote-tracking branch 'origin/master' (kraj593, Feb 15, 2021)
2a7b506 fixed an error in handling pd_series where duplicated elements are no… (kraj593, Mar 8, 2021)
a29d002 fixes error in handling datasets shorter than savgol filter window (kraj593, Mar 8, 2021)
729a9a7 Fixes issue with outlier removal - iterative outlier removal was prev… (kraj593, Mar 8, 2021)
78cc88a fixes issue with blank subtraction to store subtracted data in the ti… (kraj593, Mar 8, 2021)
04045ac treat blanks as constants for stat calculations. makes calculations m… (kraj593, Mar 8, 2021)
4ca3153 accommodate OD700, OD660 as biomass analytes (kraj593, Mar 8, 2021)
ebeedb3 accommodate OD700, OD660 as biomass analytes. (kraj593, Mar 8, 2021)
40dc64c fix issue with plotting error bars - error bars were previously plott… (kraj593, Mar 8, 2021)
3360848 plotly.plotly deprecated (kraj593, May 12, 2021)
fbddce5 fixes issue with exception being raised if curve fitting is enabled f… (kraj593, May 12, 2021)
7b2e500 re-enable proper stdev calculations for features (kraj593, May 12, 2021)
8224104 testing failing build with 3.6 (kraj593, May 12, 2021)
7c569ba fixes issue with sqlalchemy's latest version. (kraj593, May 12, 2021)
bca9868 can handle newer versions of plotly now (kraj593, May 12, 2021)
18a50a3 cleaning up unused packages (kraj593, May 12, 2021)
20 changes: 9 additions & 11 deletions impact/core/Experiment.py
@@ -1,17 +1,9 @@
import sqlite3 as sql
import time

from .AnalyteData import TimeCourse, Biomass, Product, Substrate, Reporter
from .ReplicateTrial import ReplicateTrial
from .SingleTrial import SingleTrial

try:
from pyexcel_xlsx import get_data
except ImportError as e:
print('Could not import pyexcel')
print(e)
pass

from ..database import Base
from sqlalchemy import Column, Integer, ForeignKey, Float, Date, String, event
from sqlalchemy.orm import relationship
@@ -140,15 +132,21 @@ def calculate(self):
repstage.calculate()
print("Ran analysis in %0.1fs\n" % ((time.time() - t0)))

if settings.perform_curve_fit and 'OD600' in self.analyte_names:
biomass_analyte = None
if 'OD600' in self.analyte_names:
biomass_analyte = 'OD600'
elif 'OD700' in self.analyte_names:
biomass_analyte = 'OD700'

if settings.perform_curve_fit and biomass_analyte:
rep_list = [rep for rep in self.replicate_trials if
rep.trial_identifier.strain.name.lower() not in ['blank', 'none']]
rep_list = sorted(rep_list, key=lambda rep: str(rep.trial_identifier))
avg_list = []
error_list = []
for rep in rep_list:
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
avg_growth = rep.avg.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
avg_list.append(avg_growth)
error_list.append(std_growth / avg_growth * 100)
max_growth_rate = max(avg_list)
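The change above replaces the hard-coded 'OD600' lookup with a biomass_analyte variable so that OD700 data (and, per the commit messages, OD660) can drive growth-rate curve fitting. A minimal sketch of the fallback logic; `pick_biomass_analyte` is a hypothetical helper name, not part of the package:

```python
# Hypothetical helper mirroring the fallback introduced above; the hunk
# itself only checks OD600 and OD700, while the commits also mention OD660.
def pick_biomass_analyte(analyte_names):
    """Return the first available optical-density analyte, or None."""
    for candidate in ('OD600', 'OD700', 'OD660'):
        if candidate in analyte_names:
            return candidate
    return None

assert pick_biomass_analyte({'OD700', 'glucose'}) == 'OD700'
assert pick_biomass_analyte({'lactate'}) is None
```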
27 changes: 13 additions & 14 deletions impact/core/ReplicateTrial.py
@@ -228,10 +228,9 @@ def calculate_statistics(self):
self.avg.analyte_dict[analyte].pd_series = self.replicate_df[analyte].mean(axis=1)
self.std.analyte_dict[analyte].pd_series = self.replicate_df[analyte].std(axis=1)

#This is the right way to calculate standard deviations for blank subtraction. You must add the two variances
if self.blank_subtraction_flag and analyte in self.blank_subtracted_analytes:
self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
+ np.square(self.blank.std.analyte_dict[analyte].pd_series))
self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
+ np.square(self.blank.std.analyte_dict[analyte].pd_series))
#Assume that stdev for all values <0 is simply 0 since negative values are forced to be 0.
#Negative values of any analyte in this context is not possible
#self.std.analyte_dict[analyte].pd_series[self.avg.analyte_dict[analyte].pd_series<=0] = 0
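The guard and comment above implement standard propagation of independent errors: for blank-subtracted data y = sample - blank, the variances add, so the combined stdev is sqrt(std_sample² + std_blank²), and it is only applied to analytes that were actually blank-subtracted. A toy illustration with made-up numbers:

```python
import numpy as np
import pandas as pd

# Toy data: stdev of the replicate average and of the blank at three times.
sample_std = pd.Series([0.03, 0.04, 0.05], index=[0.0, 0.5, 1.0])
blank_std = pd.Series([0.01, 0.01, 0.02], index=[0.0, 0.5, 1.0])

# Variances (not stdevs) add for a difference of independent measurements.
combined_std = np.sqrt(np.square(sample_std) + np.square(blank_std))
print(combined_std[0.0])  # sqrt(0.03**2 + 0.01**2) ~= 0.0316
```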
@@ -257,13 +256,13 @@ def calculate_statistics(self):
if feature_data is not None:
single_trial_data.append(feature_data)
if self.blank:
with np.errstate(divide='ignore'):
with np.errstate(divide='ignore'):

temp_var = np.square(feature_data)*(np.square(self.blank.std.analyte_dict[analyte].pd_series\

/trial.analyte_dict[analyte].pd_series)+
/trial.analyte_dict[analyte].data_vector)+
np.square(self.blank.std.analyte_dict[biomass_analyte].pd_series
/trial.analyte_dict[biomass_analyte].pd_series))
/trial.analyte_dict[biomass_analyte].data_vector))
temp_var[trial.analyte_dict[analyte].pd_series == 0] = 0
temp_var[trial.analyte_dict[biomass_analyte].pd_series == 0] = 0
single_trial_var.append(temp_var)
@@ -277,7 +276,7 @@ def calculate_statistics(self):
# Variance on dataset due to blanks is average of individual standard deviation squared.
# Total variance is variance due to blanks + variance between individual normalized datapoints
if self.blank:
rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var
rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var

setattr(self.std.analyte_dict[analyte], feature.name, np.sqrt(rep_var).values)
setattr(self.avg.analyte_dict[analyte], feature.name, rep_mean)
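The temp_var expression above is first-order error propagation for a ratio f = analyte / biomass: (σ_f/f)² ≈ (σ_a/a)² + (σ_b/b)², with the blank stdevs supplying the σ terms. A worked one-point example with illustrative numbers:

```python
import numpy as np

# Illustrative values: analyte and biomass readings with their blank stdevs.
a, sigma_a = 2.0, 0.1
b, sigma_b = 0.5, 0.02

f = a / b                                             # normalized feature
var_f = f**2 * ((sigma_a / a)**2 + (sigma_b / b)**2)  # propagated variance
print(f, np.sqrt(var_f))                              # 4.0 +/- ~0.26
```

The rep_var line then adds sum(var_i)/n², i.e. the average blank variance divided by n (the blanks' contribution to the variance of a mean over n trials), on top of the between-replicate variance, matching the comment in the hunk.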
@@ -379,7 +378,7 @@ def prune_bad_replicates(self, analyte): # Remove outliers
# A value between 0 and 1, > 1 means removing the replicate makes the yield worse

# df = pd.DataFrame({key: np.random.randn(5) for key in ['a', 'b', 'c']})
std_by_mean = np.mean(abs(backup.std(axis = 1)/backup.mean(axis = 1)))
std_by_mean = np.mean(abs(df.std(axis=1)/df.mean(axis=1)))
temp_std_by_mean = {}
for temp_remove_replicate in list(df.columns.values):
indices = [replicate for i, replicate in enumerate(df.columns.values) if
@@ -390,7 +389,7 @@ def prune_bad_replicates(self, analyte): # Remove outliers
temp_std_by_mean[temp_remove_replicate] = np.mean(abs(temp_std / temp_mean))

temp_min_val = min([temp_std_by_mean[key] for key in temp_std_by_mean])
if temp_min_val < std_deviation_cutoff and temp_min_val < std_by_mean:
if abs(temp_min_val-std_by_mean) >= std_deviation_cutoff and temp_min_val < std_by_mean:
bad_replicate_cols.append(
[key for key in temp_std_by_mean if temp_std_by_mean[key] == temp_min_val][0])
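Two fixes land in this function: the baseline coefficient of variation is now computed from df, the frame actually being pruned, rather than a stale backup, and a replicate is only dropped when removing it improves the CV by at least std_deviation_cutoff, which stops the iterative pruning from eating well-behaved replicates one by one. A hedged sketch of the criterion (function and argument names are illustrative, not the package API):

```python
import numpy as np
import pandas as pd

def worst_replicate(df, cutoff=0.1):
    """Return the column whose removal improves the mean CV by >= cutoff."""
    baseline = np.mean(abs(df.std(axis=1) / df.mean(axis=1)))
    cv_without = {
        col: np.mean(abs(df.drop(columns=col).std(axis=1)
                         / df.drop(columns=col).mean(axis=1)))
        for col in df.columns
    }
    col, cv = min(cv_without.items(), key=lambda kv: kv[1])
    if cv < baseline and abs(cv - baseline) >= cutoff:
        return col  # dropping this replicate helps enough to justify it
    return None
```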

@@ -425,13 +424,13 @@ def substract_blank(self):
for single_trial in self.single_trials:
for blank_analyte in analytes_with_blanks:
if blank_analyte in single_trial.analyte_dict:
single_trial.analyte_dict[blank_analyte].data_vector = \
single_trial.analyte_dict[blank_analyte].data_vector \
- self.blank.avg.analyte_dict[blank_analyte].data_vector

single_trial.analyte_dict[blank_analyte].pd_series = \
single_trial.analyte_dict[blank_analyte].pd_series \
- self.blank.avg.analyte_dict[blank_analyte].pd_series
single_trial.analyte_dict[blank_analyte].generate_time_point_list()
#single_trial.analyte_dict[blank_analyte].data_vector = \
# single_trial.analyte_dict[blank_analyte].data_vector.clip(min = 0)
self.blank_subtracted_analytes.append(blank_analyte)
self.blank_subtracted_analytes.append(blank_analyte)
self.blank_subtracted_analytes = list(set(self.blank_subtracted_analytes))
self.blank_subtraction_flag = True
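Per the commit message, subtracted data is now stored in the analyte's pd_series (with the time point list regenerated from it) instead of only patching data_vector, so downstream consumers see a consistent state. The index-aligned Series subtraction it relies on, with synthetic numbers:

```python
import pandas as pd

# Index-aligned subtraction keeps sample and blank time points matched.
sample = pd.Series([0.10, 0.35, 0.80], index=[0.0, 1.0, 2.0])
blank = pd.Series([0.08, 0.09, 0.09], index=[0.0, 1.0, 2.0])

corrected = sample - blank
print(corrected.tolist())  # ~[0.02, 0.26, 0.71], up to float rounding
```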

9 changes: 3 additions & 6 deletions impact/core/analytes/Base.py
@@ -245,6 +245,8 @@ def calculate(self):
def find_death_phase(self, data_vector):
from ..settings import settings
use_filtered_data = settings.use_filtered_data
if len(self.pd_series) < self.savgol_filter_window_size:
use_filtered_data = False
verbose = settings.verbose
self.death_phase_start = self.find_death_phase_static(data_vector,
use_filtered_data = use_filtered_data,
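The added guard disables filtered data whenever the series is shorter than the Savitzky-Golay window, since scipy raises an error if window_length exceeds the data length. The same pattern in isolation:

```python
import numpy as np
from scipy.signal import savgol_filter

def maybe_smooth(values, window=11, polyorder=3):
    """Smooth with Savitzky-Golay, falling back to raw data when too short."""
    values = np.asarray(values, dtype=float)
    if len(values) < window:
        return values  # scipy would raise: window_length > data length
    return savgol_filter(values, window_length=window, polyorder=polyorder)

print(maybe_smooth([1.0, 2.0, 3.0]))  # too short, returned unfiltered
```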
@@ -312,7 +314,6 @@ def add_timepoint(self, time_point):
self.time_points.append(time_point)
if len(self.time_points) == 1:
self.trial_identifier = time_point.trial_identifier
self.pd_series = pd.Series([time_point.data],index=[time_point.time])
else:
if self.time_points[-1].trial_identifier.unique_single_trial() \
!= self.time_points[-2].trial_identifier.unique_single_trial():
@@ -326,11 +327,7 @@ def add_timepoint(self, time_point):
self.time_points.sort(key=lambda timePoint: timePoint.time)


if sum(self.pd_series.index.duplicated()) > 0:
print(self.pd_series)
print(self.trial_identifier)
print(time_point.trial_identifier)
raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')


def curve_fit_data(self):
raise Exception('This must be implemented in a child')
4 changes: 1 addition & 3 deletions impact/core/analytes/Product.py
@@ -15,7 +15,5 @@ def curve_fit_data(self):
from ..settings import settings
verbose = settings.verbose

if self.trial_identifier.analyte_type == 'product':
raise Exception('Product curve fitting not implemented')
else:
if self.trial_identifier.analyte_type != 'product':
raise Exception('Incorrect analyte_type')
12 changes: 6 additions & 6 deletions impact/helpers/synthetic_data.py
@@ -35,11 +35,11 @@ def generate_data(y0, t, model, biomass_keys, substrate_keys, product_keys, nois
sol = model.optimize()

# Let's assign the data to these variables
biomass_flux = [sol.x_dict[biomass_keys[0]]]
biomass_flux = [sol.fluxes[biomass_keys[0]]]

substrate_flux = [sol.x_dict[substrate_keys[0]]]
substrate_flux = [sol.fluxes[substrate_keys[0]]]

product_flux = [sol.x_dict[key] for key in product_keys]
product_flux = [sol.fluxes[key] for key in product_keys]

exchange_keys = biomass_keys + substrate_keys + product_keys

@@ -96,11 +96,11 @@ def dFBA_functions(y, t, t_max, model, analyte_dict, bar):
return [0]*len(exchange_keys)
else:
# Let's assign the data to these variables
biomass_flux = [model.solution.x_dict[biomass_keys[0]]]
biomass_flux = [solution.fluxes[biomass_keys[0]]]

substrate_flux = [model.solution.x_dict[substrate_keys[0]]]
substrate_flux = [solution.fluxes[substrate_keys[0]]]

product_flux = [model.solution.x_dict[key] for key in product_keys]
product_flux = [solution.fluxes[key] for key in product_keys]

exchange_keys = biomass_keys + substrate_keys + product_keys

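Solution.x_dict was deprecated and later removed from cobrapy; fluxes, a pandas Series keyed by reaction id, is the replacement swapped in above. A sketch against the bundled textbook model; the loader function and reaction id here are assumptions about a recent cobrapy, not taken from this repo:

```python
import cobra

# Assumes cobrapy >= 0.22, where bundled models load via cobra.io.load_model.
model = cobra.io.load_model("textbook")
sol = model.optimize()

growth_flux = sol.fluxes["Biomass_Ecoli_core"]  # was: sol.x_dict[...]
print(growth_flux)
```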
11 changes: 9 additions & 2 deletions impact/parsers.py
@@ -357,7 +357,8 @@ def parse_raw_data(cls, data_format=None, id_type='traverse', file_name=None, da

# Extract data from sheets
data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
for row in xls_data[sheet.title]] for sheet in xls_data}
for row in xls_data[sheet.title]]
if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}

if data_format in cls.parser_case_dict.keys():
cls.parser_case_dict[data_format](experiment, data=data, id_type=id_type, plate_type = plate_type)
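The added conditional maps empty worksheets to None instead of parsing them: openpyxl reports max_row == 1 for a sheet with no data rows, which is what previously broke parsing (the 'parsing empty sheets' commit). The guard in isolation, with a hypothetical file name:

```python
import openpyxl

wb = openpyxl.load_workbook('raw_data.xlsx')  # hypothetical input file

# Sheets with only a header row (or nothing) become None rather than [[...]].
data = {
    sheet.title: [[cell.value for cell in row] for row in sheet.rows]
    if sheet.max_row > 1 else None
    for sheet in wb
}
```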
@@ -415,7 +416,8 @@ def parse_raw_data(format=None, id_type='CSV', file_name=None, data=None, experi

# Extract data from sheets
data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
for row in xls_data[sheet.title]] for sheet in xls_data}
for row in xls_data[sheet.title]]
if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}

# Import parsers
parser_case_dict = {'spectromax_OD' : spectromax_OD,
@@ -473,6 +475,11 @@ def parse_time_point_list(time_point_list):
for analyte in analyte_dict.values():
analyte.pd_series = pd.Series([timePoint.data for timePoint in analyte.time_points],
index=[timePoint.time for timePoint in analyte.time_points])
if sum(analyte.pd_series.index.duplicated()) > 0:
print(analyte.pd_series)
print(analyte.trial_identifier)
print(analyte.pd_series[analyte.pd_series.index.duplicated()])
raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')

tf = sys_time.time()
print("Parsed %i time points in %0.1fs" % (len(time_point_list), (tf - t0)))
20 changes: 11 additions & 9 deletions impact/plotting.py
@@ -44,7 +44,6 @@

from plotly import tools
import plotly.graph_objs as go
import plotly.plotly as py
import colorlover as cl
import math

@@ -291,7 +290,7 @@ def printGenericTimeCourse_plotly(replicateTrialList=None, dbName=None, strainsT
else:
y_avg = [replicate.avg.analyte_dict[product].data_vector for replicate in replicateTrialList
if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
y_std = [replicate.std.analyte_dict[product].data_vector for replicate in replicateTrialList
y_std = [replicate.std.analyte_dict[product].pd_series.values for replicate in replicateTrialList
if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
label = ' titer (g/L)'

@@ -434,7 +433,7 @@ def render_output_ploty(output_type, fig, number_of_columns=None, column_width_m
plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
fig['layout'].update(width=number_of_columns * column_width_multiplier)
random_file_name = ''.join(random.choice(string.ascii_letters) for _ in range(10)) + '.png'
py.image.save_as(fig, random_file_name, scale=img_scale)
return random_file_name


@@ -491,7 +489,7 @@ def print_generic_timecourse_plotly(replicate_trial_list, product, colors, pts_p
if product != 'OD600':
dataLabel = '<br>titer (g/L)'
y_avg = replicate.avg.analyte_dict[product].data_vector[::removePointFraction]
y_std = replicate.std.analyte_dict[product].data_vector[::removePointFraction]
y_std = replicate.std.analyte_dict[product].pd_series.values[::removePointFraction]
elif normalize_to is not None:
y_avg = replicate.avg.analyte_dict[product].get_normalized_data(normalize_to)[
::removePointFraction]
@@ -661,7 +659,7 @@ def time_profile_traces(replicate_trials=None, feature=None, analyte='OD600', co

if colors is None:
colors = get_colors(len(replicate_trials), colors=colors, cl_scales=cl_scales)

replicate_trials = sorted(replicate_trials, key=lambda rep: str(rep.trial_identifier))
for index, replicate in enumerate(replicate_trials):
# Determine how many points should be plotted
required_num_pts = replicate.t[-1] * pts_per_hour
@@ -672,7 +670,7 @@
y=replicate.avg.analyte_dict[analyte].data_vector[::removePointFraction],
error_y={
'type' : 'data',
'array' : replicate.std.analyte_dict[analyte].data_vector[::removePointFraction],
'array' : replicate.std.analyte_dict[analyte].pd_series.values[::removePointFraction],
'visible': True,
'color' : colors[index]},
# mode=mode,
@@ -735,7 +733,7 @@ def analyte_bar_trace(replicate_trials=None, feature=None, analyte='OD600', colo
print("That is not a valid point")
return None
data_point = replicate.avg.analyte_dict[analyte].data_vector[index_to_plot]
error = replicate.std.analyte_dict[analyte].data_vector[index_to_plot]
error = replicate.std.analyte_dict[analyte].pd_series.values[index_to_plot]

x_list.append(str(replicate.trial_identifier.strain) +" in " + str(replicate.trial_identifier.media))
y_list.append(data_point)
@@ -1407,8 +1405,12 @@ def plot_growth_curve_fit(expt=None, format=None):
fig['layout'].update(title='Growth curve fit for ' + str(rep.trial_identifier))
fig['layout']['yaxis1'].update(title='OD600')
plot(fig, image=format)
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
if 'OD600' in rep.avg.analyte_dict:
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
elif 'OD700' in rep.avg.analyte_dict:
avg_growth = rep.avg.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
print("\u03BC\u2090\u1D65 = %3.3f \u00B1 %3.3f /h" % (avg_growth, std_growth))
else:
print("Curve fitting was not implemented for this experiment. Please check Impact settings.")
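Across plotting.py the error-bar arrays now come from std.analyte_dict[...].pd_series.values rather than data_vector (the 'fix issue with plotting error bars' commit), and the removed plotly.plotly import and py.image.save_as call reflect that module's deprecation (its functionality moved to the separate chart-studio package). A minimal offline trace with data-type error bars, using synthetic numbers:

```python
import plotly.graph_objs as go
from plotly.offline import plot

trace = go.Scatter(
    x=[0, 1, 2, 3],                      # time (h)
    y=[0.10, 0.40, 1.10, 2.30],          # replicate average
    error_y={'type': 'data',             # per-point error bars from stdevs
             'array': [0.01, 0.03, 0.08, 0.15],
             'visible': True},
)
plot([trace], filename='growth_curve.html')  # writes standalone HTML
```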
4 changes: 3 additions & 1 deletion readme.md
@@ -4,6 +4,8 @@

# Impact: a framework for analyzing microbial physiology

Associated with the research article: "Impact framework: A python package for writing data analysis workflows to interpret microbial physiology" https://doi.org/10.1016/j.mec.2019.e00089.

Impact assists scientists and engineers to interpret data describing microbial physiology.
Impact parses raw data from common equipment such as chromatography (HPLC),
and plate readers. Data is automatically interpreted and built into hierarchical objects
@@ -51,4 +53,4 @@ Impact comes with scripts that test the proper functioning of the package. These
The documentation is available in `docs` or a rendered version is available [here](http://impact.readthedocs.io/en/latest/)

## Starter Files
A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
6 changes: 2 additions & 4 deletions requirements.txt
@@ -1,8 +1,6 @@
sqlalchemy
sqlalchemy==1.3.23
numpy>=1.10.4
pandas
scipy>=0.17.0
lmfit==0.8.3
openpyxl==2.5.3
tabulate
pyexcel-xlsx
openpyxl==2.5.3
2 changes: 1 addition & 1 deletion requirements_plotting.txt
@@ -1,3 +1,3 @@
plotly==2.7.0
plotly
colorlover
matplotlib>=1.5.1