merge

Rpan4853 · Rpan4853 · commit 1df0d3a183a8 · 2025-05-16T15:01:05.000-04:00
diff --git a/notebooks/feature_engineering.ipynb b/notebooks/feature_engineering.ipynb
@@ -188,6 +188,14 @@
    "id": "7bfd56a7",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[GUIDE] Successfully performed generate_entityset.\n",
+      "\tYou can perform the next step by calling generate_label_times.\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
@@ -290,6 +298,14 @@
    "id": "e0ee16eb",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[GUIDE] Successfully performed generate_label_times.\n",
+      "\tYou can perform the next step by calling generate_feature_matrix.\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -458,7 +474,9 @@
       "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3464: RuntimeWarning: Mean of empty slice.\n",
       "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
       "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/numpy/core/_methods.py:192: RuntimeWarning: invalid value encountered in scalar divide\n",
-      "  ret = ret.dtype.type(ret / rcount)\n"
+      "  ret = ret.dtype.type(ret / rcount)\n",
+      "[GUIDE] Successfully performed generate_feature_matrix.\n",
+      "\tYou can perform the next step by calling generate_train_test_split.\n"
      ]
     }
    ],
@@ -627,10 +645,10 @@
        " <Feature: MAX(pidata_processed.fft.mean.mean_value)>,\n",
        " <Feature: MIN(pidata_processed.fft.mean.mean_value)>,\n",
        " <Feature: SUM(pidata_processed.fft.mean.mean_value)>,\n",
-       " <Feature: COUNT(alarms WHERE DES_NAME = Alarm2)>,\n",
        " <Feature: COUNT(alarms WHERE DES_NAME = Alarm1)>,\n",
-       " <Feature: SUM(alarms.IND_DURATION WHERE DES_NAME = Alarm2)>,\n",
+       " <Feature: COUNT(alarms WHERE DES_NAME = Alarm2)>,\n",
        " <Feature: SUM(alarms.IND_DURATION WHERE DES_NAME = Alarm1)>,\n",
+       " <Feature: SUM(alarms.IND_DURATION WHERE DES_NAME = Alarm2)>,\n",
        " <Feature: MAX(stoppages.NUM_WORDS(DES_COMMENTS))>,\n",
        " <Feature: MAX(stoppages.NUM_WORDS(DES_DESCRIPTION))>,\n",
        " <Feature: MAX(stoppages.NUM_WORDS(DES_WO_NAME))>,\n",
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.5.dev0
+current_version = 0.0.5.dev2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
@@ -124,6 +124,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sintel-dev/zephyr',
-    version='0.0.5.dev0',
+    version='0.0.5.dev2',
     zip_safe=False,
 )
diff --git a/tests/test_guide.py b/tests/test_guide.py
@@ -4,16 +4,11 @@
 class DummyObject:
     def __init__(self):
         producers_and_getters = [
-            ([self.step0_key, self.step0_set], [self.step0_getter]),
-            ([self.step1_key, self.step1_set], [self.step1_getter]),
-            ([self.step2_key, self.step2_set], [self.step2_getter])
+            ([self.step0_key], [self.step0_set], [self.step0_getter]),
+            ([self.step1_key], [self.step1_set], [self.step1_getter]),
+            ([self.step2_key], [self.step2_set], [self.step2_getter])
         ]
-        set_methods = {
-            self.step0_set.__name__,
-            self.step1_set.__name__,
-            self.step2_set.__name__
-        }
-        self._guide_handler = GuideHandler(producers_and_getters, set_methods)
+        self._guide_handler = GuideHandler(producers_and_getters)
 
     @guide
     def step0_key(self):
diff --git a/zephyr_ml/__init__.py b/zephyr_ml/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dai-lab@mit.edu'
-__version__ = '0.0.5.dev0'
+__version__ = '0.0.5.dev2'
 
 import os
 
diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py
@@ -30,40 +30,37 @@
 
 class GuideHandler:
 
-    def __init__(self, producers_and_getters, set_methods):
+    def __init__(self, ordered_steps):
         self.cur_iteration = 0
         self.current_step = -1
         self.start_point = -1
-        self.producers_and_getters = producers_and_getters
-        self.set_methods = set_methods
+        self.ordered_steps = ordered_steps
+        self.set_methods = set()
 
         self.producer_to_step_map = {}
         self.getter_to_step_map = {}
 
         self.iterations = []
-        for idx, (producers, getters) in enumerate(self.producers_and_getters):
+        for idx, (keys, sets, gets) in enumerate(self.ordered_steps):
             self.iterations.append(-1)
 
-            for prod in producers:
+            for prod in keys:
                 self.producer_to_step_map[prod.__name__] = idx
+            for prod in sets:
+                self.producer_to_step_map[prod.__name__] = idx
+                self.set_methods.add(prod.__name__)
 
-            for get in getters:
+            for get in gets:
                 self.getter_to_step_map[get.__name__] = idx
 
-    def get_necessary_steps(self, actual_next_step):
-        step_strs = []
-        for step in range(self.current_step, actual_next_step):
-            option_strs = []
-            for opt in self.producers_and_getters[step][0]:
-                option_strs.append(opt.__name__)
-            step_strs.append(f"{step}. {' or '.join(option_strs)}")
-        return "\n".join(step_strs)
+    def or_join(self, methods):
+        return " or ".join([method.__name__ for method in methods])
 
     def get_get_steps_in_between(self, cur_step, next_step):
         step_strs = []
         for step in range(cur_step + 1, next_step):
             step_strs.append(
-                f"{step} {self.producers_and_getters[step][1][0]}")
+                f"{step} {self.or_join(self.ordered_steps[step][2])}")
         return step_strs
 
     def get_last_up_to_date(self, next_step):
@@ -80,23 +77,25 @@ def get_steps_in_between(self, cur_step, next_step):
         step_strs = []
         for step in range(cur_step + 1, next_step):
             option_strs = []
-            for opt in self.producers_and_getters[step][0]:
-                option_strs.append(opt.__name__)
-            step_strs.append(f"{step}. {' or '.join(option_strs)}")
+            option_strs.extend(self.ordered_steps[step][0])
+            option_strs.extend(self.ordered_steps[step][1])
+            step_strs.append(f"{step}. {self.or_join(option_strs)}")
         return step_strs
 
     def log_next_producer_step(self, name):
         next_step = self.current_step + 1
 
-        if next_step >= len(self.producers_and_getters):
-            LOGGER.warning("[GUIDE] You have reached the end of the \
-                           predictive engineering workflow.\
-                           You may continue to go back and reperform steps based on results.")
+        if next_step >= len(self.ordered_steps):
+            cur_step_name = self.or_join(self.ordered_steps[self.current_step][0])
+            LOGGER.warning((f"[GUIDE] Successfully performed {name}.\n"
+                            f"\tYou have reached the end of the "
+                            f"predictive engineering workflow.\n"
+                            f"\tYou can call {cur_step_name} again or re-perform previous steps "
+                            f"based on results."))
         else:
-            next_step_name = self.producers_and_getters[next_step][0][0].__name__
-            LOGGER.warning(f"[GUIDE] Successfully performed {name}. You can perform the \
-                           next step by calling\
-                            {next_step_name}.")
+            next_step_name = self.or_join(self.ordered_steps[next_step][0])
+            LOGGER.warning(f"[GUIDE] Successfully performed {name}.\n"
+                           f"\tYou can perform the next step by calling {next_step_name}.")
 
     def perform_producer_step(self, zephyr, method,
                               *method_args, **method_kwargs):
@@ -137,21 +136,19 @@ def try_log_backwards_key_method_warning(self, name, next_step):
                         f"{steps_in_between}"))
 
     def log_get_inconsistent_warning(self, name, next_step):
-        prod_steps_str = ' or '.join([method.__name__ for method in
-                                      self.producers_and_getters[next_step][0]])
+        prod_steps_str = self.or_join(self.ordered_steps[next_step][0])
         prod_steps = f"{next_step}.{prod_steps_str}"
         latest_up_to_date = self.get_last_up_to_date(next_step)
-        LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: Unable to perform {name} \
-                       because {prod_steps} has not \
-                        been run yet. Run steps starting at or before \
-                        {latest_up_to_date} ")
+        LOGGER.warning((f"[GUIDE] INCONSISTENCY WARNING: Unable to perform {name} because"
+                        f"{prod_steps} has not been run yet.\n"
+                        f"Run steps starting at or before {latest_up_to_date}."))
 
     def log_get_stale_warning(self, name, next_step):
         latest_up_to_date = self.get_last_up_to_date(next_step)
-        LOGGER.warning(f"[GUIDE] STALE WARNING: Performing {name}. \
-                       This data is potentially stale. \
-                        Re-run steps starting at or before \
-                        {latest_up_to_date} to ensure data is up to date.")
+        LOGGER.warning((f"[GUIDE] STALE WARNING: Performing {name}.\n"
+                        f"This data is potentially stale.\n"
+                        f"Re-run steps starting at or before {latest_up_to_date}"
+                        f"to ensure data is up to date."))
 
     # tries to perform step if possible -> warns that data might be stale
 
@@ -216,52 +213,36 @@ def try_perform_inconsistent_producer_step(  # add using stale and overwriting
         # not up to date
         if (next_step >= self.current_step and
                 self.iterations[next_step - 1] != self.cur_iteration):
-            corr_set_method = self.producers_and_getters[next_step][0][1].__name__
+            corr_set_method = self.or_join(self.ordered_steps[next_step][1])
             prev_step = next_step - 1
-            prev_set_method = self.producers_and_getters[prev_step][0][1].__name__
-            prev_key_method = self.producers_and_getters[prev_step][0][0].__name__
-            LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING:Unable \
-                           to perform {name} because you are\
-                            performing a key method at\
-                            step {next_step} but the result of the previous step, \
-                            step {prev_step}, is STALE.\
-                           If you already have the data for step {next_step}, \
-                            you can use the corresponding set method: {corr_set_method}.\
-                            Otherwise, please perform step {prev_step} \
-                                with {prev_key_method} or {prev_set_method}.")
-        # inconsistent backward step: performing set method at nonzero step
-        # elif next_step < self.current_step and name in self.set_method:
-        #     first_set_method = self.producers_and_getters[0][0][1].__name__
-        #     corr_key_method = self.producers_and_getters[next_step][0][0].__name__
-        #     LOGGER.warning(f"Unable to perform {name} because you are going backwards \
-        #                    and performing step {next_step} with a set method.\
-        #                    You can only perform a backwards step with a set \
-        #                     method at step 0: {first_set_method}.\
-        #                     If you would like to perform step {next_step}, \
-        #                         please use the corresponding key method: {corr_key_method}.")
-        # inconsistent backward step: performing key method but previous step
-        # is not up to date
+            prev_set_method = self.or_join(self.ordered_steps[prev_step][1])
+            prev_key_method = self.or_join(self.ordered_steps[prev_step][0])
+            LOGGER.warning(f"""[GUIDE] INCONSISTENCY WARNING:Unable\
+                           to perform {name} because you are performing a key method at
+                           step {next_step} but the result of the previous step,
+                           step {prev_step}, is STALE.
+                           If you already have the data for step {next_step},
+                            you can use the corresponding set method: {corr_set_method}.
+                           Otherwise, please perform step {prev_step} with
+                           {prev_key_method} or {prev_set_method}.""")
         elif (next_step < self.current_step and
               self.iterations[next_step - 1] != self.cur_iteration):
             prev_step = next_step - 1
-            prev_key_method = self.producers_and_getters[prev_step][0][0].__name__
-            corr_set_method = self.producers_and_getters[next_step][0][1].__name__
-            prev_get_method = self.producers_and_getters[prev_step][1][0].__name__
-            prev_set_method = self.producers_and_getters[prev_step][0][1].__name__
-            LOGGER.warning(f"[GUIDE] INCONSISTENCY WARNING: Unable to perform {name} \
-                           because you are going \
-                           backwards and starting a new iteration by\
-                           performing a key method at step {next_step} \
-                            but the result of the previous step,\
-                            step {prev_step}, is STALE.\
-                            If you want to use the STALE result of the PREVIOUS step, \
-                            you can call {prev_get_method} to get the data, then\
-                            {prev_set_method} to set the data, and then recall this method.\
-                            If you want to regenerate the data of the PREVIOUS step, \
-                            please call {prev_key_method}, and then recall this method.\
-                            If you already have the data for THIS step, you can \
-                            call {corr_set_method} to set the data.\
-                            ")
+            prev_key_method = self.or_join(self.ordered_steps[prev_step][0])
+            corr_set_method = self.or_join(self.ordered_steps[next_step][1])
+            prev_get_method = self.or_join(self.ordered_steps[prev_step][2])
+            prev_set_method = self.or_join(self.ordered_steps[prev_step][1])
+            LOGGER.warning(f"""[GUIDE] INCONSISTENCY WARNING: Unable to perform {name}
+                           because you are going backwards and starting a new iteration by
+                           performing a key method at step {next_step} but the result of the
+                           previous step, step {prev_step}, is STALE.
+                           If you want to use the STALE result of the PREVIOUS step,
+                           you can call {prev_get_method} to get the data, then
+                           {prev_set_method} to set the data, and then recall this method.
+                           If you want to regenerate the data of the PREVIOUS step,
+                           please call {prev_key_method}, and then recall this method.
+                           If you already have the data for THIS step, you can call
+                           {corr_set_method} to set the data.""")
 
     def try_perform_getter_step(
             self, zephyr, method, *method_args, **method_kwargs):
@@ -344,24 +325,15 @@ def __init__(self):
 
         # tuple of 2 arrays: producers and attributes
         step_order = [
-            ([
-                self.generate_entityset, self.set_entityset], [
-                self.get_entityset]), ([
-                    self.generate_label_times, self.set_label_times], [
-                    self.get_label_times]), ([
-                        self.generate_feature_matrix, self.set_feature_matrix], [
-                            self.get_feature_matrix]), ([
-                                self.generate_train_test_split, self.set_train_test_split], [
-                                    self.get_train_test_split]), ([
-                                        self.fit_pipeline, self.set_fitted_pipeline], [
-                                            self.get_fitted_pipeline]), ([
-                                                self.predict, self.evaluate], [])]
-        set_methods = set([self.set_entityset.__name__,
-                           self.set_label_times.__name__,
-                           self.set_feature_matrix.__name__,
-                           self.set_train_test_split.__name__,
-                           self.set_fitted_pipeline.__name__])
-        self._guide_handler = GuideHandler(step_order, set_methods)
+            ([self.generate_entityset], [self.set_entityset], [self.get_entityset]),
+            ([self.generate_label_times], [self.set_label_times], [self.get_label_times]),
+            ([self.generate_feature_matrix], [self.set_feature_matrix], [self.get_feature_matrix]),
+            ([self.generate_train_test_split], [self.set_train_test_split],
+                [self.get_train_test_split]),
+            ([self.fit_pipeline], [self.set_fitted_pipeline], [self.get_fitted_pipeline]),
+            ([self.predict, self.evaluate], [], [])
+            ]
+        self._guide_handler = GuideHandler(step_order)
 
     def GET_ENTITYSET_TYPES(self):
         """Get the supported entityset types and their required dataframes/columns.

Original file line number	Diff line number	Diff line change
`@@ -124,6 +124,6 @@`
`124`	`124`	`test_suite='tests',`
`125`	`125`	`tests_require=tests_require,`
`126`	`126`	`url='https://github.com/sintel-dev/zephyr',`
`127`		`- version='0.0.5.dev0',`
	`127`	`+ version='0.0.5.dev2',`
`128`	`128`	`zip_safe=False,`
`129`	`129`	`)`