Changes from all commits (21 commits)
78a6d04 fix issue with parsing empty sheets. (kraj593, May 8, 2020)
d8772f0 removed unnecessary dependencies (kraj593, May 8, 2020)
b9fd123 Update readme.md (kraj593, May 12, 2020)
7b93231 Update readme.md (kraj593, May 12, 2020)
4669a8a update attribute names for cobrapy objects to account for deprecated … (kraj593, Feb 15, 2021)
3802607 Merge remote-tracking branch 'origin/master' (kraj593, Feb 15, 2021)
2a7b506 fixed an error in handling pd_series where duplicated elements are no… (kraj593, Mar 8, 2021)
a29d002 fixes error in handling datasets shorter than savgol filter window (kraj593, Mar 8, 2021)
729a9a7 Fixes issue with outlier removal - iterative outlier removal was prev… (kraj593, Mar 8, 2021)
78cc88a fixes issue with blank subtraction to store subtracted data in the ti… (kraj593, Mar 8, 2021)
04045ac treat blanks as constants for stat calculations. makes calculations m… (kraj593, Mar 8, 2021)
4ca3153 accommodate OD700, OD660 as biomass analytes (kraj593, Mar 8, 2021)
ebeedb3 accommodate OD700, OD660 as biomass analytes. (kraj593, Mar 8, 2021)
40dc64c fix issue with plotting error bars - error bars were previously plott… (kraj593, Mar 8, 2021)
3360848 plotly.plotly deprecated (kraj593, May 12, 2021)
fbddce5 fixes issue with exception being raised if curve fitting is enabled f… (kraj593, May 12, 2021)
7b2e500 re-enable proper stdev calculations for features (kraj593, May 12, 2021)
8224104 testing failing build with 3.6 (kraj593, May 12, 2021)
7c569ba fixes issue with sqlalchemy's latest version. (kraj593, May 12, 2021)
bca9868 can handle newer versions of plotly now (kraj593, May 12, 2021)
18a50a3 cleaning up unused packages (kraj593, May 12, 2021)
20 changes: 9 additions & 11 deletions impact/core/Experiment.py
@@ -1,17 +1,9 @@
import sqlite3 as sql
import time

from .AnalyteData import TimeCourse, Biomass, Product, Substrate, Reporter
from .ReplicateTrial import ReplicateTrial
from .SingleTrial import SingleTrial

try:
from pyexcel_xlsx import get_data
except ImportError as e:
print('Could not import pyexcel')
print(e)
pass

from ..database import Base
from sqlalchemy import Column, Integer, ForeignKey, Float, Date, String, event
from sqlalchemy.orm import relationship
@@ -140,15 +132,21 @@ def calculate(self):
repstage.calculate()
print("Ran analysis in %0.1fs\n" % ((time.time() - t0)))

if settings.perform_curve_fit and 'OD600' in self.analyte_names:
biomass_analyte = None
if 'OD600' in self.analyte_names:
biomass_analyte = 'OD600'
elif 'OD700' in self.analyte_names:
biomass_analyte = 'OD700'

if settings.perform_curve_fit and biomass_analyte:
rep_list = [rep for rep in self.replicate_trials if
rep.trial_identifier.strain.name.lower() not in ['blank', 'none']]
rep_list = sorted(rep_list, key=lambda rep: str(rep.trial_identifier))
avg_list = []
error_list = []
for rep in rep_list:
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
avg_growth = rep.avg.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict[biomass_analyte].fit_params['growth_rate'].parameter_value
avg_list.append(avg_growth)
error_list.append(std_growth / avg_growth * 100)
max_growth_rate = max(avg_list)
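The change above replaces the hard-coded 'OD600' lookup with a biomass_analyte variable so that OD700 data (and, per the commit messages, OD660) can drive growth-rate curve fitting. A minimal sketch of the fallback logic; `pick_biomass_analyte` is a hypothetical helper name, not part of the package:

```python
# Hypothetical helper mirroring the fallback introduced above; the hunk
# itself only checks OD600 and OD700, while the commits also mention OD660.
def pick_biomass_analyte(analyte_names):
    """Return the first available optical-density analyte, or None."""
    for candidate in ('OD600', 'OD700', 'OD660'):
        if candidate in analyte_names:
            return candidate
    return None

assert pick_biomass_analyte({'OD700', 'glucose'}) == 'OD700'
assert pick_biomass_analyte({'lactate'}) is None
```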
27 changes: 13 additions & 14 deletions impact/core/ReplicateTrial.py
@@ -228,10 +228,9 @@ def calculate_statistics(self):
self.avg.analyte_dict[analyte].pd_series = self.replicate_df[analyte].mean(axis=1)
self.std.analyte_dict[analyte].pd_series = self.replicate_df[analyte].std(axis=1)

#This is the right way to calculate standard deviations for blank subtraction. You must add the two variances
if self.blank_subtraction_flag and analyte in self.blank_subtracted_analytes:
self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
+ np.square(self.blank.std.analyte_dict[analyte].pd_series))
self.std.analyte_dict[analyte].pd_series = np.sqrt(np.square(self.std.analyte_dict[analyte].pd_series)
+ np.square(self.blank.std.analyte_dict[analyte].pd_series))
#Assume that stdev for all values <0 is simply 0 since negative values are forced to be 0.
#Negative values of any analyte in this context is not possible
#self.std.analyte_dict[analyte].pd_series[self.avg.analyte_dict[analyte].pd_series<=0] = 0
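The guard and comment above implement standard propagation of independent errors: for blank-subtracted data y = sample - blank, the variances add, so the combined stdev is sqrt(std_sample² + std_blank²), and it is only applied to analytes that were actually blank-subtracted. A toy illustration with made-up numbers:

```python
import numpy as np
import pandas as pd

# Toy data: stdev of the replicate average and of the blank at three times.
sample_std = pd.Series([0.03, 0.04, 0.05], index=[0.0, 0.5, 1.0])
blank_std = pd.Series([0.01, 0.01, 0.02], index=[0.0, 0.5, 1.0])

# Variances (not stdevs) add for a difference of independent measurements.
combined_std = np.sqrt(np.square(sample_std) + np.square(blank_std))
print(combined_std[0.0])  # sqrt(0.03**2 + 0.01**2) ~= 0.0316
```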
@@ -257,13 +256,13 @@ def calculate_statistics(self):
if feature_data is not None:
single_trial_data.append(feature_data)
if self.blank:
with np.errstate(divide='ignore'):
with np.errstate(divide='ignore'):

temp_var = np.square(feature_data)*(np.square(self.blank.std.analyte_dict[analyte].pd_series\

/trial.analyte_dict[analyte].pd_series)+
/trial.analyte_dict[analyte].data_vector)+
np.square(self.blank.std.analyte_dict[biomass_analyte].pd_series
/trial.analyte_dict[biomass_analyte].pd_series))
/trial.analyte_dict[biomass_analyte].data_vector))
temp_var[trial.analyte_dict[analyte].pd_series == 0] = 0
temp_var[trial.analyte_dict[biomass_analyte].pd_series == 0] = 0
single_trial_var.append(temp_var)
@@ -277,7 +276,7 @@ def calculate_statistics(self):
# Variance on dataset due to blanks is average of individual standard deviation squared.
# Total variance is variance due to blanks + variance between individual normalized datapoints
if self.blank:
rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var
rep_var = sum(single_trial_var)/np.square(len(single_trial_var)) + rep_var

setattr(self.std.analyte_dict[analyte], feature.name, np.sqrt(rep_var).values)
setattr(self.avg.analyte_dict[analyte], feature.name, rep_mean)
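The temp_var expression above is first-order error propagation for a ratio f = analyte / biomass: (σ_f/f)² ≈ (σ_a/a)² + (σ_b/b)², with the blank stdevs supplying the σ terms. A worked one-point example with illustrative numbers:

```python
import numpy as np

# Illustrative values: analyte and biomass readings with their blank stdevs.
a, sigma_a = 2.0, 0.1
b, sigma_b = 0.5, 0.02

f = a / b                                             # normalized feature
var_f = f**2 * ((sigma_a / a)**2 + (sigma_b / b)**2)  # propagated variance
print(f, np.sqrt(var_f))                              # 4.0 +/- ~0.26
```

The rep_var line then adds sum(var_i)/n², i.e. the average blank variance divided by n (the blanks' contribution to the variance of a mean over n trials), on top of the between-replicate variance, matching the comment in the hunk.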
@@ -379,7 +378,7 @@ def prune_bad_replicates(self, analyte): # Remove outliers
# A value between 0 and 1, > 1 means removing the replicate makes the yield worse

# df = pd.DataFrame({key: np.random.randn(5) for key in ['a', 'b', 'c']})
std_by_mean = np.mean(abs(backup.std(axis = 1)/backup.mean(axis = 1)))
std_by_mean = np.mean(abs(df.std(axis=1)/df.mean(axis=1)))
temp_std_by_mean = {}
for temp_remove_replicate in list(df.columns.values):
indices = [replicate for i, replicate in enumerate(df.columns.values) if
@@ -390,7 +389,7 @@ def prune_bad_replicates(self, analyte): # Remove outliers
temp_std_by_mean[temp_remove_replicate] = np.mean(abs(temp_std / temp_mean))

temp_min_val = min([temp_std_by_mean[key] for key in temp_std_by_mean])
if temp_min_val < std_deviation_cutoff and temp_min_val < std_by_mean:
if abs(temp_min_val-std_by_mean) >= std_deviation_cutoff and temp_min_val < std_by_mean:
bad_replicate_cols.append(
[key for key in temp_std_by_mean if temp_std_by_mean[key] == temp_min_val][0])
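Two fixes land in this function: the baseline coefficient of variation is now computed from df, the frame actually being pruned, rather than a stale backup, and a replicate is only dropped when removing it improves the CV by at least std_deviation_cutoff, which stops the iterative pruning from eating well-behaved replicates one by one. A hedged sketch of the criterion (function and argument names are illustrative, not the package API):

```python
import numpy as np
import pandas as pd

def worst_replicate(df, cutoff=0.1):
    """Return the column whose removal improves the mean CV by >= cutoff."""
    baseline = np.mean(abs(df.std(axis=1) / df.mean(axis=1)))
    cv_without = {
        col: np.mean(abs(df.drop(columns=col).std(axis=1)
                         / df.drop(columns=col).mean(axis=1)))
        for col in df.columns
    }
    col, cv = min(cv_without.items(), key=lambda kv: kv[1])
    if cv < baseline and abs(cv - baseline) >= cutoff:
        return col  # dropping this replicate helps enough to justify it
    return None
```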

@@ -425,13 +424,13 @@ def substract_blank(self):
for single_trial in self.single_trials:
for blank_analyte in analytes_with_blanks:
if blank_analyte in single_trial.analyte_dict:
single_trial.analyte_dict[blank_analyte].data_vector = \
single_trial.analyte_dict[blank_analyte].data_vector \
- self.blank.avg.analyte_dict[blank_analyte].data_vector

single_trial.analyte_dict[blank_analyte].pd_series = \
single_trial.analyte_dict[blank_analyte].pd_series \
- self.blank.avg.analyte_dict[blank_analyte].pd_series
single_trial.analyte_dict[blank_analyte].generate_time_point_list()
#single_trial.analyte_dict[blank_analyte].data_vector = \
# single_trial.analyte_dict[blank_analyte].data_vector.clip(min = 0)
self.blank_subtracted_analytes.append(blank_analyte)
self.blank_subtracted_analytes.append(blank_analyte)
self.blank_subtracted_analytes = list(set(self.blank_subtracted_analytes))
self.blank_subtraction_flag = True
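Per the commit message, subtracted data is now stored in the analyte's pd_series (with the time point list regenerated from it) instead of only patching data_vector, so downstream consumers see a consistent state. The index-aligned Series subtraction it relies on, with synthetic numbers:

```python
import pandas as pd

# Index-aligned subtraction keeps sample and blank time points matched.
sample = pd.Series([0.10, 0.35, 0.80], index=[0.0, 1.0, 2.0])
blank = pd.Series([0.08, 0.09, 0.09], index=[0.0, 1.0, 2.0])

corrected = sample - blank
print(corrected.tolist())  # ~[0.02, 0.26, 0.71], up to float rounding
```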

9 changes: 3 additions & 6 deletions impact/core/analytes/Base.py
@@ -245,6 +245,8 @@ def calculate(self):
def find_death_phase(self, data_vector):
from ..settings import settings
use_filtered_data = settings.use_filtered_data
if len(self.pd_series) < self.savgol_filter_window_size:
use_filtered_data = False
verbose = settings.verbose
self.death_phase_start = self.find_death_phase_static(data_vector,
use_filtered_data = use_filtered_data,
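The added guard disables filtered data whenever the series is shorter than the Savitzky-Golay window, since scipy raises an error if window_length exceeds the data length. The same pattern in isolation:

```python
import numpy as np
from scipy.signal import savgol_filter

def maybe_smooth(values, window=11, polyorder=3):
    """Smooth with Savitzky-Golay, falling back to raw data when too short."""
    values = np.asarray(values, dtype=float)
    if len(values) < window:
        return values  # scipy would raise: window_length > data length
    return savgol_filter(values, window_length=window, polyorder=polyorder)

print(maybe_smooth([1.0, 2.0, 3.0]))  # too short, returned unfiltered
```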
@@ -312,7 +314,6 @@ def add_timepoint(self, time_point):
self.time_points.append(time_point)
if len(self.time_points) == 1:
self.trial_identifier = time_point.trial_identifier
self.pd_series = pd.Series([time_point.data],index=[time_point.time])
else:
if self.time_points[-1].trial_identifier.unique_single_trial() \
!= self.time_points[-2].trial_identifier.unique_single_trial():
@@ -326,11 +327,7 @@ def add_timepoint(self, time_point):
self.time_points.sort(key=lambda timePoint: timePoint.time)


if sum(self.pd_series.index.duplicated()) > 0:
print(self.pd_series)
print(self.trial_identifier)
print(time_point.trial_identifier)
raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')


def curve_fit_data(self):
raise Exception('This must be implemented in a child')
4 changes: 1 addition & 3 deletions impact/core/analytes/Product.py
@@ -15,7 +15,5 @@ def curve_fit_data(self):
from ..settings import settings
verbose = settings.verbose

if self.trial_identifier.analyte_type == 'product':
raise Exception('Product curve fitting not implemented')
else:
if self.trial_identifier.analyte_type != 'product':
raise Exception('Incorrect analyte_type')
12 changes: 6 additions & 6 deletions impact/helpers/synthetic_data.py
@@ -35,11 +35,11 @@ def generate_data(y0, t, model, biomass_keys, substrate_keys, product_keys, nois
sol = model.optimize()

# Let's assign the data to these variables
biomass_flux = [sol.x_dict[biomass_keys[0]]]
biomass_flux = [sol.fluxes[biomass_keys[0]]]

substrate_flux = [sol.x_dict[substrate_keys[0]]]
substrate_flux = [sol.fluxes[substrate_keys[0]]]

product_flux = [sol.x_dict[key] for key in product_keys]
product_flux = [sol.fluxes[key] for key in product_keys]

exchange_keys = biomass_keys + substrate_keys + product_keys

@@ -96,11 +96,11 @@ def dFBA_functions(y, t, t_max, model, analyte_dict, bar):
return [0]*len(exchange_keys)
else:
# Let's assign the data to these variables
biomass_flux = [model.solution.x_dict[biomass_keys[0]]]
biomass_flux = [solution.fluxes[biomass_keys[0]]]

substrate_flux = [model.solution.x_dict[substrate_keys[0]]]
substrate_flux = [solution.fluxes[substrate_keys[0]]]

product_flux = [model.solution.x_dict[key] for key in product_keys]
product_flux = [solution.fluxes[key] for key in product_keys]

exchange_keys = biomass_keys + substrate_keys + product_keys

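Solution.x_dict was deprecated and later removed from cobrapy; fluxes, a pandas Series keyed by reaction id, is the replacement swapped in above. A sketch against the bundled textbook model; the loader function and reaction id here are assumptions about a recent cobrapy, not taken from this repo:

```python
import cobra

# Assumes cobrapy >= 0.22, where bundled models load via cobra.io.load_model.
model = cobra.io.load_model("textbook")
sol = model.optimize()

growth_flux = sol.fluxes["Biomass_Ecoli_core"]  # was: sol.x_dict[...]
print(growth_flux)
```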
11 changes: 9 additions & 2 deletions impact/parsers.py
@@ -357,7 +357,8 @@ def parse_raw_data(cls, data_format=None, id_type='traverse', file_name=None, da

# Extract data from sheets
data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
for row in xls_data[sheet.title]] for sheet in xls_data}
for row in xls_data[sheet.title]]
if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}

if data_format in cls.parser_case_dict.keys():
cls.parser_case_dict[data_format](experiment, data=data, id_type=id_type, plate_type = plate_type)
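The added conditional maps empty worksheets to None instead of parsing them: openpyxl reports max_row == 1 for a sheet with no data rows, which is what previously broke parsing (the 'parsing empty sheets' commit). The guard in isolation, with a hypothetical file name:

```python
import openpyxl

wb = openpyxl.load_workbook('raw_data.xlsx')  # hypothetical input file

# Sheets with only a header row (or nothing) become None rather than [[...]].
data = {
    sheet.title: [[cell.value for cell in row] for row in sheet.rows]
    if sheet.max_row > 1 else None
    for sheet in wb
}
```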
@@ -415,7 +416,8 @@ def parse_raw_data(format=None, id_type='CSV', file_name=None, data=None, experi

# Extract data from sheets
data = {sheet.title: [[elem.value if elem is not None else None for elem in row]
for row in xls_data[sheet.title]] for sheet in xls_data}
for row in xls_data[sheet.title]]
if xls_data[sheet.title].max_row > 1 else None for sheet in xls_data}

# Import parsers
parser_case_dict = {'spectromax_OD' : spectromax_OD,
@@ -473,6 +475,11 @@ def parse_time_point_list(time_point_list):
for analyte in analyte_dict.values():
analyte.pd_series = pd.Series([timePoint.data for timePoint in analyte.time_points],
index=[timePoint.time for timePoint in analyte.time_points])
if sum(analyte.pd_series.index.duplicated()) > 0:
print(analyte.pd_series)
print(analyte.trial_identifier)
print(analyte.pd_series[analyte.pd_series.index.duplicated()])
raise Exception('Duplicate time points found, this is not supported - likely an identifier input error')

tf = sys_time.time()
print("Parsed %i time points in %0.1fs" % (len(time_point_list), (tf - t0)))
20 changes: 11 additions & 9 deletions impact/plotting.py
@@ -44,7 +44,6 @@

from plotly import tools
import plotly.graph_objs as go
import plotly.plotly as py
import colorlover as cl
import math

@@ -291,7 +290,7 @@ def printGenericTimeCourse_plotly(replicateTrialList=None, dbName=None, strainsT
else:
y_avg = [replicate.avg.analyte_dict[product].data_vector for replicate in replicateTrialList
if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
y_std = [replicate.std.analyte_dict[product].data_vector for replicate in replicateTrialList
y_std = [replicate.std.analyte_dict[product].pd_series.values for replicate in replicateTrialList
if getattr(replicate.trial_identifier, sortBy) == unique or sort_by_flag is False]
label = ' titer (g/L)'

@@ -434,7 +433,7 @@ def render_output_ploty(output_type, fig, number_of_columns=None, column_width_m
plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
fig['layout'].update(width=number_of_columns * column_width_multiplier)
random_file_name = ''.join(random.choice(string.ascii_letters) for _ in range(10)) + '.png'
py.image.save_as(fig, random_file_name, scale=img_scale)
return random_file_name


@@ -491,7 +489,7 @@ def print_generic_timecourse_plotly(replicate_trial_list, product, colors, pts_p
if product != 'OD600':
dataLabel = '<br>titer (g/L)'
y_avg = replicate.avg.analyte_dict[product].data_vector[::removePointFraction]
y_std = replicate.std.analyte_dict[product].data_vector[::removePointFraction]
y_std = replicate.std.analyte_dict[product].pd_series.values[::removePointFraction]
elif normalize_to is not None:
y_avg = replicate.avg.analyte_dict[product].get_normalized_data(normalize_to)[
::removePointFraction]
@@ -661,7 +659,7 @@ def time_profile_traces(replicate_trials=None, feature=None, analyte='OD600', co

if colors is None:
colors = get_colors(len(replicate_trials), colors=colors, cl_scales=cl_scales)

replicate_trials = sorted(replicate_trials, key=lambda rep: str(rep.trial_identifier))
for index, replicate in enumerate(replicate_trials):
# Determine how many points should be plotted
required_num_pts = replicate.t[-1] * pts_per_hour
@@ -672,7 +670,7 @@
y=replicate.avg.analyte_dict[analyte].data_vector[::removePointFraction],
error_y={
'type' : 'data',
'array' : replicate.std.analyte_dict[analyte].data_vector[::removePointFraction],
'array' : replicate.std.analyte_dict[analyte].pd_series.values[::removePointFraction],
'visible': True,
'color' : colors[index]},
# mode=mode,
@@ -735,7 +733,7 @@ def analyte_bar_trace(replicate_trials=None, feature=None, analyte='OD600', colo
print("That is not a valid point")
return None
data_point = replicate.avg.analyte_dict[analyte].data_vector[index_to_plot]
error = replicate.std.analyte_dict[analyte].data_vector[index_to_plot]
error = replicate.std.analyte_dict[analyte].pd_series.values[index_to_plot]

x_list.append(str(replicate.trial_identifier.strain) +" in " + str(replicate.trial_identifier.media))
y_list.append(data_point)
@@ -1407,8 +1405,12 @@ def plot_growth_curve_fit(expt=None, format=None):
fig['layout'].update(title='Growth curve fit for ' + str(rep.trial_identifier))
fig['layout']['yaxis1'].update(title='OD600')
plot(fig, image=format)
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
if 'OD600' in rep.avg.analyte_dict:
avg_growth = rep.avg.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD600'].fit_params['growth_rate'].parameter_value
elif 'OD700' in rep.avg.analyte_dict:
avg_growth = rep.avg.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
std_growth = rep.std.analyte_dict['OD700'].fit_params['growth_rate'].parameter_value
print("\u03BC\u2090\u1D65 = %3.3f \u00B1 %3.3f /h" % (avg_growth, std_growth))
else:
print("Curve fitting was not implemented for this experiment. Please check Impact settings.")
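Across plotting.py the error-bar arrays now come from std.analyte_dict[...].pd_series.values rather than data_vector (the 'fix issue with plotting error bars' commit), and the removed plotly.plotly import and py.image.save_as call reflect that module's deprecation (its functionality moved to the separate chart-studio package). A minimal offline trace with data-type error bars, using synthetic numbers:

```python
import plotly.graph_objs as go
from plotly.offline import plot

trace = go.Scatter(
    x=[0, 1, 2, 3],                      # time (h)
    y=[0.10, 0.40, 1.10, 2.30],          # replicate average
    error_y={'type': 'data',             # per-point error bars from stdevs
             'array': [0.01, 0.03, 0.08, 0.15],
             'visible': True},
)
plot([trace], filename='growth_curve.html')  # writes standalone HTML
```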
4 changes: 3 additions & 1 deletion readme.md
@@ -4,6 +4,8 @@

# Impact: a framework for analyzing microbial physiology

Associated with the research article: "Impact framework: A python package for writing data analysis workflows to interpret microbial physiology" https://doi.org/10.1016/j.mec.2019.e00089.

Impact assists scientists and engineers to interpret data describing microbial physiology.
Impact parses raw data from common equipment such as chromatography (HPLC),
and plate readers. Data is automatically interpreted and built into hierarchical objects
@@ -51,4 +53,4 @@ Impact comes with scripts that test the proper functioning of the package. These
The documentation is available in `docs` or a rendered version is available [here](http://impact.readthedocs.io/en/latest/)

## Starter Files
A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
A starter ipynb which can be opened with Jupyter notebook has been provided in the Examples_and_Helpers folders. The file comes with comments which will assist users in analyzing their data. A helper file to create trial identifiers has also been provided in the Examples_and_Helpers folder.
6 changes: 2 additions & 4 deletions requirements.txt
@@ -1,8 +1,6 @@
sqlalchemy
sqlalchemy==1.3.23
numpy>=1.10.4
pandas
scipy>=0.17.0
lmfit==0.8.3
openpyxl==2.5.3
tabulate
pyexcel-xlsx
openpyxl==2.5.3
2 changes: 1 addition & 1 deletion requirements_plotting.txt
@@ -1,3 +1,3 @@
plotly==2.7.0
plotly
colorlover
matplotlib>=1.5.1