diff --git a/impact/core/Experiment.py b/impact/core/Experiment.py
index c9977aa..f57c5e2 100644
--- a/impact/core/Experiment.py
+++ b/impact/core/Experiment.py
@@ -1,17 +1,9 @@
-import sqlite3 as sql
 import time
 
 from .AnalyteData import TimeCourse, Biomass, Product, Substrate, Reporter
 from .ReplicateTrial import ReplicateTrial
 from .SingleTrial import SingleTrial
-try:
-    from pyexcel_xlsx import get_data
-except ImportError as e:
-    print('Could not import pyexcel')
-    print(e)
-    pass
-
 from ..database import Base
 from sqlalchemy import Column, Integer, ForeignKey, Float, Date, String, event
 from sqlalchemy.orm import relationship
 
@@ -140,15 +132,21 @@ def calculate(self):
             repstage.calculate()
         print("Ran analysis in %0.1fs\n" % ((time.time() - t0)))
 
-        if settings.perform_curve_fit and 'OD600' in self.analyte_names:
+        biomass_analyte = None
+        if 'OD600' in self.analyte_names:
+            biomass_analyte = 'OD600'
+        elif 'OD700' in self.analyte_names:
+            biomass_analyte = 'OD700'
+
+        if settings.perform_curve_fit and biomass_analyte:
             rep_list = [rep for rep in self.replicate_trials if
                         rep.trial_identifier.strain.name.lower() not in ['blank', 'none']]
             rep_list = sorted(rep_list, key=lambda rep: str(rep.trial_identifier))
             avg_list = []
             error_list = []
             for rep in rep_list:
-                avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
-                std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
+                avg_growth = rep.avg.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
+                std_growth = rep.std.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
                 avg_list.append(avg_growth)
                 error_list.append(std_growth / avg_growth * 100)
             max_growth_rate = max(avg_list)
diff --git a/impact/core/ReplicateTrial.py b/impact/core/ReplicateTrial.py
index 90a8ae3..acc645b 100644
--- a/impact/core/ReplicateTrial.py
+++ b/impact/core/ReplicateTrial.py
@@ -228,10 +228,9 @@ def calculate_statistics(self):
             self.avg.analyte_dict[analyte].pd_series = self.replicate_df[analyte].mean(axis=1)
             self.std.analyte_dict[analyte].pd_series = self.replicate_df[analyte].std(axis=1)
 
-            #This is the right way to calculate standard deviations for blank subtraction. You must add the two variances
             if self.blank_subtraction_flag and analyte in self.blank_subtracted_analytes:
-                self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
-                                                        + np.square(self.blank.std.analyte_dict[analyte].pd_series))
+                self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
+                                                                   + np.square(self.blank.std.analyte_dict[analyte].pd_series))
             #Assume that stdev for all values <0 is simply 0 since negative values are forced to be 0.
             #Negative values of any analyte in this context is not possible
             #self.std.analyte_dict[analyte].pd_series[self.avg.analyte_dict[analyte].pd_series<=0] = 0
@@ -257,13 +256,13 @@ def calculate_statistics(self):
                     if feature_data is not None:
                         single_trial_data.append(feature_data)
                         if self.blank:
-                            with np.errstate(divide='ignore'):
+                            with np.errstate(divide='ignore'):
                                 temp_var = np.square(feature_data)*(np.square(self.blank.std.analyte_dict[analyte].pd_series\
-                                                                    /trial.analyte_dict[analyte].pd_series)+
+                                                                    /trial.analyte_dict[analyte].data_vector)+
                                                                     np.square(self.blank.std.analyte_dict[biomass_analyte].pd_series
-                                                                    /trial.analyte_dict[biomass_analyte].pd_series))
+                                                                    /trial.analyte_dict[biomass_analyte].data_vector))
                                 temp_var[trial.analyte_dict[analyte].pd_series == 0] = 0
                                 temp_var[trial.analyte_dict[biomass_analyte].pd_series == 0] = 0
                                 single_trial_var.append(temp_var)
@@ -277,7 +276,7 @@ def calculate_statistics(self):
 
             # Variance on dataset due to blanks is average of individual standard deviation squared.
             # Total variance is variance due to blanks + variance between individual normalized datapoints
             if self.blank:
-                rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var
+                rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var
             setattr(self.std.analyte_dict[analyte], feature.name, np.sqrt(rep_var).values)
             setattr(self.avg.analyte_dict[analyte], feature.name, rep_mean)
@@ -379,7 +378,7 @@ def prune_bad_replicates(self, analyte):  # Remove outliers
        # A value between 0 and 1, > 1 means removing the replicate makes the yield worse
        # df = pd.DataFrame({key: np.random.randn(5) for key in ['a', 'b', 'c']})
-        std_by_mean = np.mean(abs(backup.std(axis = 1)/backup.mean(axis = 1)))
+        std_by_mean = np.mean(abs(df.std(axis=1)/df.mean(axis=1)))
         temp_std_by_mean = {}
         for temp_remove_replicate in list(df.columns.values):
             indices = [replicate for i, replicate in enumerate(df.columns.values) if
@@ -390,7 +389,7 @@ def prune_bad_replicates(self, analyte):  # Remove outliers
                 temp_std_by_mean[temp_remove_replicate] = np.mean(abs(temp_std / temp_mean))
 
             temp_min_val = min([temp_std_by_mean[key] for key in temp_std_by_mean])
-            if temp_min_val < std_deviation_cutoff and temp_min_val < std_by_mean:
+            if abs(temp_min_val-std_by_mean) >= std_deviation_cutoff and temp_min_val < std_by_mean:
                 bad_replicate_cols.append(
                     [key for key in temp_std_by_mean if temp_std_by_mean[key] == temp_min_val][0])
@@ -425,13 +424,13 @@ def substract_blank(self):
         for single_trial in self.single_trials:
             for blank_analyte in analytes_with_blanks:
                 if blank_analyte in single_trial.analyte_dict:
-                    single_trial.analyte_dict[blank_analyte].data_vector = \
-                        single_trial.analyte_dict[blank_analyte].data_vector \
-                        - self.blank.avg.analyte_dict[blank_analyte].data_vector
-
+                    single_trial.analyte_dict[blank_analyte].pd_series = \
+                        single_trial.analyte_dict[blank_analyte].pd_series \
+                        - self.blank.avg.analyte_dict[blank_analyte].pd_series
+                    single_trial.analyte_dict[blank_analyte].generate_time_point_list()
                     #single_trial.analyte_dict[blank_analyte].data_vector = \
                     #    single_trial.analyte_dict[blank_analyte].data_vector.clip(min = 0)
-                    self.blank_subtracted_analytes.append(blank_analyte)
+                    self.blank_subtracted_analytes.append(blank_analyte)
 
         self.blank_subtracted_analytes = list(set(self.blank_subtracted_analytes))
         self.blank_subtraction_flag = True
diff --git a/impact/core/analytes/Base.py b/impact/core/analytes/Base.py
index 66f7d87..23166c0 100644
--- a/impact/core/analytes/Base.py
+++ b/impact/core/analytes/Base.py
@@ -245,6 +245,8 @@ def calculate(self):
     def find_death_phase(self, data_vector):
         from ..settings import settings
         use_filtered_data = settings.use_filtered_data
+        if len(self.pd_series) < self.savgol_filter_window_size:
+            use_filtered_data = False
         verbose = settings.verbose
         self.death_phase_start = self.find_death_phase_static(data_vector,
                                                               use_filtered_data = use_filtered_data,
@@ -312,7 +314,6 @@ def add_timepoint(self, time_point):
         self.time_points.append(time_point)
         if len(self.time_points) == 1:
             self.trial_identifier = time_point.trial_identifier
-            self.pd_series = pd.Series([time_point.data],index=[time_point.time])
         else:
             if self.time_points[-1].trial_identifier.unique_single_trial() \
                     != self.time_points[-2].trial_identifier.unique_single_trial():
@@ -326,11 +327,7 @@
             self.time_points.sort(key=lambda timePoint: timePoint.time)
 
-            if sum(self.pd_series.index.duplicated()) > 0:
-                print(self.pd_series)
-                print(self.trial_identifier)
-                print(time_point.trial_identifier)
-                raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')
+
 
 
     def curve_fit_data(self):
         raise Exception('This must be implemented in a child')
diff --git a/impact/core/analytes/Product.py b/impact/core/analytes/Product.py
index a305f4c..486edb0 100644
--- a/impact/core/analytes/Product.py
+++ b/impact/core/analytes/Product.py
@@ -15,7 +15,5 @@ def curve_fit_data(self):
         from ..settings import settings
         verbose = settings.verbose
 
-        if self.trial_identifier.analyte_type == 'product':
-            raise Exception('Product curve fitting not implemented')
-        else:
+        if self.trial_identifier.analyte_type != 'product':
             raise Exception('Incorrect analyte_type')
diff --git a/impact/helpers/synthetic_data.py b/impact/helpers/synthetic_data.py
index 5ff88e7..70ecfe0 100644
--- a/impact/helpers/synthetic_data.py
+++ b/impact/helpers/synthetic_data.py
@@ -35,11 +35,11 @@ def generate_data(y0, t, model, biomass_keys, substrate_keys, product_keys, nois
     sol = model.optimize()
 
     # Let's assign the data to these variables
-    biomass_flux = [sol.x_dict[biomass_keys[0]]]
+    biomass_flux = [sol.fluxes[biomass_keys[0]]]
 
-    substrate_flux = [sol.x_dict[substrate_keys[0]]]
+    substrate_flux = [sol.fluxes[substrate_keys[0]]]
 
-    product_flux = [sol.x_dict[key] for key in product_keys]
+    product_flux = [sol.fluxes[key] for key in product_keys]
 
     exchange_keys = biomass_keys + substrate_keys + product_keys
@@ -96,11 +96,11 @@ def dFBA_functions(y, t, t_max, model, analyte_dict, bar):
         return [0]*len(exchange_keys)
     else:
         # Let's assign the data to these variables
-        biomass_flux = [model.solution.x_dict[biomass_keys[0]]]
+        biomass_flux = [solution.fluxes[biomass_keys[0]]]
 
-        substrate_flux = [model.solution.x_dict[substrate_keys[0]]]
+        substrate_flux = [solution.fluxes[substrate_keys[0]]]
 
-        product_flux = [model.solution.x_dict[key] for key in product_keys]
+        product_flux = [solution.fluxes[key] for key in product_keys]
 
         exchange_keys = biomass_keys + substrate_keys + product_keys
diff --git a/impact/parsers.py b/impact/parsers.py
index 2f76fe7..7c43ca9 100644
--- a/impact/parsers.py
+++ b/impact/parsers.py
@@ -357,7 +357,8 @@ def parse_raw_data(cls, data_format=None, id_type='traverse', file_name=None, da
 
             # Extract data from sheets
             data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
-                                  for row in xls_data[sheet.title]] for sheet in xls_data}
+                                  for row in xls_data[sheet.title]]
+                    if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}
 
         if data_format in cls.parser_case_dict.keys():
             cls.parser_case_dict[data_format](experiment, data=data, id_type=id_type, plate_type = plate_type)
@@ -415,7 +416,8 @@ def parse_raw_data(format=None, id_type='CSV', file_name=None, data=None, experi
 
     # Extract data from sheets
     data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
-                          for row in xls_data[sheet.title]] for sheet in xls_data}
+                          for row in xls_data[sheet.title]]
+            if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}
 
     # Import parsers
     parser_case_dict = {'spectromax_OD' : spectromax_OD,
@@ -473,6 +475,11 @@ def parse_time_point_list(time_point_list):
     for analyte in analyte_dict.values():
         analyte.pd_series = pd.Series([timePoint.data for timePoint in analyte.time_points],
                                       index=[timePoint.time for timePoint in analyte.time_points])
+        if sum(analyte.pd_series.index.duplicated()) > 0:
+            print(analyte.pd_series)
+            print(analyte.trial_identifier)
+            print(analyte.pd_series[analyte.pd_series.index.duplicated()])
+            raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')
 
     tf = sys_time.time()
     print("Parsed %i time points in %0.1fs" % (len(time_point_list), (tf - t0)))
diff --git a/impact/plotting.py b/impact/plotting.py
index 00723b0..1e2f8d6 100644
--- a/impact/plotting.py
+++ b/impact/plotting.py
@@ -44,7 +44,6 @@
 from plotly import tools
 import plotly.graph_objs as go
-import plotly.plotly as py
 import colorlover as cl
 
 import math
@@ -291,7 +290,7 @@ def printGenericTimeCourse_plotly(replicateTrialList=None, dbName=None, strainsT
             else:
                 y_avg = [replicate.avg.analyte_dict[product].data_vector for replicate in replicateTrialList
                          if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
-                y_std = [replicate.std.analyte_dict[product].data_vector for replicate in replicateTrialList
+                y_std = [replicate.std.analyte_dict[product].pd_series.values for replicate in replicateTrialList
                          if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
                 label = ' titer (g/L)'
@@ -434,7 +433,6 @@ def render_output_ploty(output_type, fig, number_of_columns=None, column_width_m
         plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
         fig['layout'].update(width=number_of_columns * column_width_multiplier)
         random_file_name = ''.join(random.choice(string.ascii_letters) for _ in range(10)) + '.png'
-        py.image.save_as(fig, random_file_name, scale=img_scale)
         return random_file_name
@@ -491,7 +489,7 @@ def print_generic_timecourse_plotly(replicate_trial_list, product, colors, pts_p
         if product != 'OD600':
             dataLabel = ' titer (g/L)'
             y_avg = replicate.avg.analyte_dict[product].data_vector[::removePointFraction]
-            y_std = replicate.std.analyte_dict[product].data_vector[::removePointFraction]
+            y_std = replicate.std.analyte_dict[product].pd_series.values[::removePointFraction]
         elif normalize_to is not None:
             y_avg = replicate.avg.analyte_dict[product].get_normalized_data(normalize_to)[
                     ::removePointFraction]
@@ -661,7 +659,7 @@ def time_profile_traces(replicate_trials=None, feature=None, analyte='OD600', co
     if colors is None:
         colors = get_colors(len(replicate_trials), colors=colors, cl_scales=cl_scales)
 
-
+    replicate_trials = sorted(replicate_trials, key=lambda rep: str(rep.trial_identifier))
     for index, replicate in enumerate(replicate_trials):
         # Determine how many points should be plotted
         required_num_pts = replicate.t[-1] * pts_per_hour
@@ -672,7 +670,7 @@ def time_profile_traces(replicate_trials=None, feature=None, analyte='OD600', co
                            y=replicate.avg.analyte_dict[analyte].data_vector[::removePointFraction],
                            error_y={
                                'type'   : 'data',
-                               'array'  : replicate.std.analyte_dict[analyte].data_vector[::removePointFraction],
+                               'array'  : replicate.std.analyte_dict[analyte].pd_series.values[::removePointFraction],
                                'visible': True,
                                'color'  : colors[index]},
                            # mode=mode,
@@ -735,7 +733,7 @@ def analyte_bar_trace(replicate_trials=None, feature=None, analyte='OD600', colo
             print("That is not a valid point")
             return None
         data_point = replicate.avg.analyte_dict[analyte].data_vector[index_to_plot]
-        error = replicate.std.analyte_dict[analyte].data_vector[index_to_plot]
+        error = replicate.std.analyte_dict[analyte].pd_series.values[index_to_plot]
         x_list.append(str(replicate.trial_identifier.strain) +" in " + str(replicate.trial_identifier.media))
         y_list.append(data_point)
@@ -1407,8 +1405,12 @@ def plot_growth_curve_fit(expt=None, format=None):
         fig['layout'].update(title='Growth curve fit for ' + str(rep.trial_identifier))
         fig['layout']['yaxis1'].update(title='OD600')
         plot(fig, image=format)
-        avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
-        std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
+        if 'OD600' in rep.avg.analyte_dict:
+            avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
+            std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
+        elif 'OD700' in rep.avg.analyte_dict:
+            avg_growth = rep.avg.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
+            std_growth = rep.std.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
         print("\u03BC\u2090\u1D65 = %3.3f \u00B1 %3.3f /h" % (avg_growth, std_growth))
     else:
         print("Curve fitting was not implemented for this experiment. Please check Impact settings.")
diff --git a/readme.md b/readme.md
index 012de95..3dd05a0 100644
--- a/readme.md
+++ b/readme.md
@@ -4,6 +4,8 @@
 
 # Impact: a framework for analyzing microbial physiology
 
+Associated with the research article: "Impact framework: A python package for writing data analysis workflows to interpret microbial physiology" https://doi.org/10.1016/j.mec.2019.e00089.
+
 Impact assists scientists and engineers to interpret data describing microbial physiology. Impact parses raw
 data from common equipment such as chromatography (HPLC), and plate readers. Data is automatically
 interpreted and built into hierarchical objects
@@ -51,4 +53,4 @@ Impact comes with scripts that test the proper functioning of the package. These
 The documentation is available in `docs` or a rendered version is available [here](http://impact.readthedocs.io/en/latest/)
 
 ## Starter Files
-A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
\ No newline at end of file
+A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
diff --git a/requirements.txt b/requirements.txt
index 0f48b27..e892ccb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,6 @@
-sqlalchemy
+sqlalchemy==1.3.23
 numpy>=1.10.4
 pandas
 scipy>=0.17.0
 lmfit==0.8.3
-openpyxl==2.5.3
-tabulate
-pyexcel-xlsx
\ No newline at end of file
+openpyxl==2.5.3
\ No newline at end of file
diff --git a/requirements_plotting.txt b/requirements_plotting.txt
index 441ed02..42c2b99 100644
--- a/requirements_plotting.txt
+++ b/requirements_plotting.txt
@@ -1,3 +1,3 @@
-plotly==2.7.0
+plotly
 colorlover
 matplotlib>=1.5.1
\ No newline at end of file