diff --git a/CHANGELOG.md b/CHANGELOG.md index 76cfe037..a2ee9433 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ All notable changes to this project will be documented in this file. If you make - Modern dependency stacks now supported! - Upgrading past major versions of Scikit-Learn (1.0) and Pandas (2.0), in conjunction with their own dependencies, caused small divergences in the MonteCarloAEP analysis method with Daily GBM, and the Wake Losses analysis method with UQ. The magnitude of the differences are small compared with the magnitude of the output. - In general, OpenOA is now moving away from pinning the maximum dependency version, and will stick to defining minimum dependencies to ensure modern API usage is supported across the software. +- `utils.filters.bin_filter` was converted from a for loop to a vectorized method +- `utils.filters.bin_filter` and `utils.timeseries.percent_nan` were converted to be nearly pure NumPy methods operating on NumPy arrays for significant speedups of the TIE analysis method. +- `analysis.TurbineLongTermGrossEnergy.filter_turbine_data` was cleaned up for a minor gain in efficiency and readability. ## 3.0rc2 - Everything from release candidate 1 diff --git a/examples/03_turbine_ideal_energy.ipynb b/examples/03_turbine_ideal_energy.ipynb index 0f93b734..4a92522b 100644 --- a/examples/03_turbine_ideal_energy.ipynb +++ b/examples/03_turbine_ideal_energy.ipynb @@ -126,7 +126,7 @@ " \n", " \n", " time\n", - " WTUR_TurNam\n", + " asset_id\n", " \n", " \n", " \n", @@ -206,29 +206,29 @@ "" ], "text/plain": [ - " WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n", - "time WTUR_TurNam \n", - "2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n", - " R80721 -1.01 441.06000 6.39 \n", - " R80790 -0.96 658.53003 7.11 \n", - " R80711 -0.93 514.23999 6.87 \n", - "2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n", + " WROT_BlPthAngVal WTUR_W WMET_HorWdSpd \\\n", + "time asset_id \n", + "2014-01-01 00:00:00 R80736 -1.00 642.78003 7.12 \n", + " R80721 -1.01 441.06000 6.39 \n", + " R80790 -0.96 658.53003 7.11 \n", + " R80711 -0.93 514.23999 6.87 \n", + "2014-01-01 00:10:00 R80790 -0.96 640.23999 7.01 \n", "\n", - " Va_avg WMET_EnvTmp Ya_avg \\\n", - "time WTUR_TurNam \n", - "2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 \n", - " R80721 -2.48 4.94 179.82001 \n", - " R80790 1.07 4.55 172.39000 \n", - " R80711 6.95 4.30 172.77000 \n", - "2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 \n", + " Va_avg WMET_EnvTmp Ya_avg WMET_HorWdDir \\\n", + "time asset_id \n", + "2014-01-01 00:00:00 R80736 0.66 4.69 181.34000 182.00999 \n", + " R80721 -2.48 4.94 179.82001 177.36000 \n", + " R80790 1.07 4.55 172.39000 173.50999 \n", + " R80711 6.95 4.30 172.77000 179.72000 \n", + "2014-01-01 00:10:00 R80790 -1.90 4.68 172.39000 170.46001 \n", "\n", - " WMET_HorWdDir energy_kwh WTUR_SupWh \n", - "time WTUR_TurNam \n", - "2014-01-01 00:00:00 R80736 182.00999 107.130005 107.130005 \n", - " R80721 177.36000 73.510000 73.510000 \n", - " R80790 173.50999 109.755005 109.755005 \n", - " R80711 179.72000 85.706665 85.706665 \n", - "2014-01-01 00:10:00 R80790 170.46001 106.706665 106.706665 " + " energy_kwh WTUR_SupWh \n", + "time asset_id \n", + "2014-01-01 00:00:00 R80736 107.130005 107.130005 \n", + " R80721 73.510000 73.510000 \n", + " R80790 109.755005 109.755005 \n", + " R80711 85.706665 85.706665 \n", + "2014-01-01 00:10:00 R80790 106.706665 106.706665 " ] }, "execution_count": 4, @@ -305,7 +305,7 @@ "output_type": "stream", "text": [ "INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n", - "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:12<00:00, 6.32s/it]\n", + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00, 6.86s/it]\n", "INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n" ] } @@ -540,7 +540,7 @@ "output_type": "stream", "text": [ "INFO:openoa.analysis.turbine_long_term_gross_energy:Running the long term gross energy analysis\n", - "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [16:11<00:00, 9.72s/it]\n", + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:48<00:00, 6.49s/it]\n", "INFO:openoa.analysis.turbine_long_term_gross_energy:Run completed\n" ] } @@ -566,7 +566,7 @@ { "data": { "text/plain": [ - "13571894.791458625" + "13629417.559871903" ] }, "execution_count": 13, @@ -586,7 +586,7 @@ { "data": { "text/plain": [ - "589131.3024185459" + "643649.6728541044" ] }, "execution_count": 14, @@ -608,7 +608,7 @@ "output_type": "stream", "text": [ "Mean long-term turbine ideal energy is 13.6 GWh/year\n", - "Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.3% percent\n" + "Uncertainty in long-term turbine ideal energy is 0.6 GWh/year, or 4.7% percent\n" ] } ], diff --git a/openoa/analysis/turbine_long_term_gross_energy.py b/openoa/analysis/turbine_long_term_gross_energy.py index 57c6122f..2a223bb5 100644 --- a/openoa/analysis/turbine_long_term_gross_energy.py +++ b/openoa/analysis/turbine_long_term_gross_energy.py @@ -300,63 +300,49 @@ def filter_turbine_data(self) -> None: # Loop through turbines for t in self.turbine_ids: - turb_capac = dic[t]["WTUR_W"].max() + scada_df = self.scada_dict[t] + turbine_capacity_real = scada_df.WTUR_W.max() max_bin = ( - self._run.max_power_filter * turb_capac + self._run.max_power_filter * turbine_capacity_real ) # Set maximum range for using bin-filter - dic[t].dropna( + scada_df.dropna( subset=["WMET_HorWdSpd", "WTUR_SupWh"], inplace=True ) # Drop any data where scada wind speed or energy is NaN - # Flag turbine energy data less than zero - dic[t].loc[:, "flag_neg"] = filters.range_flag( - dic[t].loc[:, "WTUR_W"], lower=0, upper=turb_capac + scada_df = scada_df.assign( + flag_neg=filters.range_flag(scada_df.WTUR_W, lower=0, upper=turbine_capacity_real), + flag_range=filters.range_flag(scada_df.WMET_HorWdSpd, lower=0, upper=40), + flag_frozen=filters.unresponsive_flag(scada_df.WMET_HorWdSpd, threshold=3), + flag_window=filters.window_range_flag( + window_col=dic[t].loc[:, "WMET_HorWdSpd"], + window_start=5.0, + window_end=40, + value_col=dic[t].loc[:, "WTUR_W"], + value_min=0.02 * turbine_capacity_real, + value_max=1.2 * turbine_capacity_real, + ), + flag_bin=filters.bin_filter( + bin_col=dic[t].loc[:, "WTUR_W"], + value_col=dic[t].loc[:, "WMET_HorWdSpd"], + bin_width=0.06 * turbine_capacity_real, + threshold=self._run.wind_bin_thresh, + center_type="median", + bin_min=np.round(0.01 * turbine_capacity_real), + bin_max=np.round(max_bin), + threshold_type="std", + direction="all", + ), ) - # Apply range filter - dic[t].loc[:, "flag_range"] = filters.range_flag( - dic[t].loc[:, "WMET_HorWdSpd"], lower=0, upper=40 - ) - # Apply frozen/unresponsive sensor filter - dic[t].loc[:, "flag_frozen"] = filters.unresponsive_flag( - dic[t].loc[:, "WMET_HorWdSpd"], threshold=3 - ) - # Apply window range filter - dic[t].loc[:, "flag_window"] = filters.window_range_flag( - window_col=dic[t].loc[:, "WMET_HorWdSpd"], - window_start=5.0, - window_end=40, - value_col=dic[t].loc[:, "WTUR_W"], - value_min=0.02 * turb_capac, - value_max=1.2 * turb_capac, - ) - - threshold_wind_bin = self._run.wind_bin_thresh - # Apply bin-based filter - dic[t].loc[:, "flag_bin"] = filters.bin_filter( - bin_col=dic[t].loc[:, "WTUR_W"], - value_col=dic[t].loc[:, "WMET_HorWdSpd"], - bin_width=0.06 * turb_capac, - threshold=threshold_wind_bin, # wind bin thresh - center_type="median", - bin_min=np.round(0.01 * turb_capac), - bin_max=np.round(max_bin), - threshold_type="std", - direction="all", - ) - # Create a 'final' flag which is true if any of the previous flags are true - dic[t].loc[:, "flag_final"] = ( - (dic[t].loc[:, "flag_range"]) - | (dic[t].loc[:, "flag_window"]) - | (dic[t].loc[:, "flag_bin"]) - | (dic[t].loc[:, "flag_frozen"]) + self.scada_dict[t].loc[:, "flag_final"] = ( + scada_df.flag_range + | scada_df.flag_window + | scada_df.flag_bin + | scada_df.flag_frozen ) - # Set negative turbine data to zero - dic[t].loc[dic[t]["flag_neg"], "WTUR_W"] = 0 - def setup_daily_reanalysis_data(self) -> None: """ Process reanalysis data to daily means for later use in the GAM model. diff --git a/openoa/utils/filters.py b/openoa/utils/filters.py index 3fc22ceb..ccd93ec6 100644 --- a/openoa/utils/filters.py +++ b/openoa/utils/filters.py @@ -147,8 +147,8 @@ def std_range_flag( raise ValueError("The inputs to `col` and `threshold` must be the same length.") subset = data.loc[:, col].copy() - data_mean = subset.mean(axis=0) - data_std = subset.std(axis=0) * np.array(threshold) + data_mean = np.nanmean(subset.values, axis=0) + data_std = np.nanstd(subset.values, ddof=1, axis=0) * np.array(threshold) flag = subset.le(data_mean - data_std) | subset.ge(data_mean + data_std) # Return back a pd.Series if one was provided, else a pd.DataFrame @@ -232,9 +232,9 @@ def bin_filter( # Set bin min and max values if not passed to function if bin_min is None: - bin_min = bin_col.min() + bin_min = np.min(bin_col.values) if bin_max is None: - bin_max = bin_col.max() + bin_max = np.max(bin_col.values) # Define bin edges bin_edges = np.arange(bin_min, bin_max, bin_width) @@ -242,37 +242,48 @@ def bin_filter( # Ensure the last bin edge value is bin_max bin_edges = np.unique(np.clip(np.append(bin_edges, bin_max), bin_min, bin_max)) - # Define empty flag of 'False' values with indices matching value_col - flag = pd.Series(index=value_col.index, data=False) - - # Loop through bins and applying flagging - nbins = len(bin_edges) - for i in range(nbins - 1): - # Get data that fall wihtin bin - y_bin = value_col.loc[(bin_col <= bin_edges[i + 1]) & (bin_col > bin_edges[i])] - - # Get center of binned data - center = y_bin.mean() if center_type == "mean" else y_bin.median() - - # Define threshold of data flag - if threshold_type == "std": - deviation = y_bin.std() * threshold - elif threshold_type == "scalar": - deviation = threshold - else: # median absolute deviation (mad) - deviation = (y_bin - center).abs().median() * threshold - - # Perform flagging depending on specfied direction - if direction == "above": - flag_bin = y_bin > (center + deviation) - elif direction == "below": - flag_bin = y_bin < (center - deviation) - else: - flag_bin = (y_bin > (center + deviation)) | (y_bin < (center - deviation)) - - # Record flags in final flag column - flag.loc[flag_bin.index] = flag_bin - + # Bin the data and recreate the comparison data as a multi-column data frame + which_bin_col = np.digitize(bin_col, bin_edges, right=True) + + # Create the flag values as a matrix with each column being the timestamp's binned value, + # e.g., all columns values are NaN if the data point is not in that bin + flag_vals = ( + value_col.to_frame().set_index(pd.Series(which_bin_col, name="bin"), append=True).unstack() + ) + drop = [i for i, el in enumerate(flag_vals.columns.names) if el != "bin"] + flag_vals.columns = flag_vals.columns.droplevel(drop).rename(None) + + # Create a False array as default, so flags are set to True + flag_df = pd.DataFrame(np.zeros_like(flag_vals, dtype=bool), index=flag_vals.index) + + # Get center of binned data + if center_type == "median": + center = np.nanmedian(flag_vals.values, axis=0) + else: + center = np.nanmean(flag_vals.values, axis=0) + center = pd.DataFrame( + np.full(flag_vals.shape, center), + index=flag_vals.index, + columns=flag_vals.columns, + ) + + # Define threshold of data flag + if threshold_type == "std": + deviation = np.nanstd(flag_vals.values, ddof=1, axis=0) * threshold + elif threshold_type == "scalar": + deviation = threshold + else: # median absolute deviation (mad) + deviation = np.nanmedian(np.abs(flag_vals.values - center.values), axis=0) * threshold + + # Perform flagging depending on specfied direction + if direction in ("above", "all"): + flag_df |= flag_vals > center + deviation + if direction in ("below", "all"): + flag_df |= flag_vals < center - deviation + + # Get all instances where the value is True, and reset any values outside the bin limits + flag = pd.Series(np.nanmax(flag_df, axis=1), index=flag_df.index) + flag.loc[(bin_col <= bin_min) | (bin_col > bin_max)] = False return flag diff --git a/openoa/utils/timeseries.py b/openoa/utils/timeseries.py index 8712d974..38316efe 100644 --- a/openoa/utils/timeseries.py +++ b/openoa/utils/timeseries.py @@ -227,7 +227,7 @@ def percent_nan(col: pd.Series | str, data: pd.DataFrame = None): Returns: :obj:`float`: Percentage of NaN data in the data series """ - return 1 if (denominator := float(col.size)) == 0 else col.isnull().sum() / denominator + return 1 if (denominator := float(col.size)) == 0 else np.isnan(col.values).sum() / denominator @series_method(data_cols=["dt_col"]) diff --git a/test/unit/test_timeseries_toolkit.py b/test/unit/test_timeseries_toolkit.py index a8e18f60..36d66e48 100644 --- a/test/unit/test_timeseries_toolkit.py +++ b/test/unit/test_timeseries_toolkit.py @@ -143,9 +143,11 @@ def test_num_hours(self): def test_percent_nan(self): test_dict = {} - test_dict["a"] = pd.Series(["", 1, 2, 1e5, np.Inf]) - test_dict["b"] = pd.Series(["", np.nan, 2, 1e5, np.Inf]) - test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan]) + + # All should be float Series given PlantData requirements + test_dict["a"] = pd.Series([True, 1, 2, 1e5, np.Inf]).astype(float) + test_dict["b"] = pd.Series([False, np.nan, 2, 1e5, np.Inf]).astype(float) + test_dict["c"] = pd.Series([np.nan, 1, 2, 1e5, np.nan]).astype(float) nan_values = {"a": 0.0, "b": 0.2, "c": 0.4}