Commit 8034ba4

Authored Nov 5, 2024
Add Hunting Anomalies in the Stock Market scripts (#780)
* Add Hunting Anomalies in the Stock Market scripts
* Fix lint
* Ignore type and fix typo
* Fix type def from linter
* Removed json dump

1 parent: 53808f8

File tree: 5 files changed, +506 −0 lines changed

 
`README.md` (new file, +49 lines)
# Hunting Anomalies in the Stock Market

This repository contains all the necessary scripts and data directories used in the [Hunting Anomalies in the Stock Market](https://polygon.io/blog/hunting-anomalies-in-stock-market/) tutorial, hosted on Polygon.io's blog. The tutorial demonstrates how to detect statistical anomalies in historical US stock market data through a comprehensive workflow that involves downloading data, building a lookup table, querying for anomalies, and visualizing them through a web interface.

### Prerequisites

- Python 3.8+
- Access to Polygon.io's historical data via Flat Files
- An active Polygon.io API key, obtainable by signing up for a Stocks paid plan
### Repository Contents

- `README.md`: This file, outlining setup and execution instructions.
- `aggregates_day/`: Directory where downloaded CSV data files are stored.
- `build-lookup-table.py`: Python script to build a lookup table from the historical data.
- `query-lookup-table.py`: Python script to query the lookup table for anomalies.
- `gui-lookup-table.py`: Python script for a browser-based interface to explore anomalies visually.
### Running the Tutorial

1. **Ensure Python 3.8+ is installed:** Check your Python version and ensure the required third-party libraries (`polygon-api-client` and `pandas`) are installed; `pickle` and `argparse` ship with Python's standard library and need no installation.
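For example (a minimal check, assuming `pip` targets your Python 3.8+ interpreter):
```bash
python3 --version   # should report 3.8 or newer
pip install polygon-api-client pandas
```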
2. **Set up your API key:** Make sure you have an active paid Polygon.io Stocks subscription for accessing Flat Files. Set up your API key in your environment or directly in the scripts where required.
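For example, to provide the key via the environment (`gui-lookup-table.py` constructs `RESTClient()` with no explicit key, so it reads `POLYGON_API_KEY`):
```bash
export POLYGON_API_KEY=YOUR_API_KEY   # replace with your actual key
```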
3. **Download Historical Data:** Use the MinIO client (`mc`) to download historical stock market data. Adjust the commands and paths based on the data you are interested in.
```bash
mc alias set s3polygon https://files.polygon.io YOUR_ACCESS_KEY YOUR_SECRET_KEY
mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/08/ ./aggregates_day/
mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/09/ ./aggregates_day/
mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/10/ ./aggregates_day/
gunzip ./aggregates_day/*.gz
```
4. **Build the Lookup Table:** This script processes the downloaded data and builds a lookup table, saving it as `lookup_table.pkl`.
```bash
python build-lookup-table.py
```
5. **Query Anomalies:** Replace `2024-10-18` with the date you want to analyze for anomalies.
```bash
python query-lookup-table.py 2024-10-18
```
6. **Run the GUI:** Start the web server, then open `http://localhost:8888` in your browser to explore the anomalies visually.
```bash
python gui-lookup-table.py
```
For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market/).
`aggregates_day/` (new file, +1 line)

Download flat files into here.
`build-lookup-table.py` (new file, +91 lines)

```python
import os
import pickle
from collections import defaultdict
from typing import Any, BinaryIO, DefaultDict, Dict

import pandas as pd  # type: ignore

# Directory containing the daily CSV files
data_dir = "./aggregates_day/"

# Initialize a dictionary to hold trades data
trades_data = defaultdict(list)

# List all CSV files in the directory
files = sorted([f for f in os.listdir(data_dir) if f.endswith(".csv")])

print("Starting to process files...")

# Process each file (assuming files are named in order)
for file in files:
    print(f"Processing {file}")
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    # For each stock, store the date and relevant data
    for _, row in df.iterrows():
        ticker = row["ticker"]
        date = pd.to_datetime(row["window_start"], unit="ns").date()
        trades = row["transactions"]
        close_price = row["close"]  # Ensure 'close' column exists in your CSV
        trades_data[ticker].append(
            {"date": date, "trades": trades, "close_price": close_price}
        )

print("Finished processing files.")
print("Building lookup table...")

# Now, build the lookup table with rolling averages and percentage price change
lookup_table: DefaultDict[str, Dict[str, Any]] = defaultdict(
    dict
)  # Nested dict: ticker -> date -> stats

for ticker, records in trades_data.items():
    # Convert records to DataFrame
    df_ticker = pd.DataFrame(records)
    # Sort records by date
    df_ticker.sort_values("date", inplace=True)
    df_ticker.set_index("date", inplace=True)

    # Calculate the percentage change in close_price
    df_ticker["price_diff"] = (
        df_ticker["close_price"].pct_change() * 100
    )  # Multiply by 100 for percentage

    # Shift trades to exclude the current day from rolling calculations
    df_ticker["trades_shifted"] = df_ticker["trades"].shift(1)
    # Calculate rolling average and standard deviation over the previous 5 days
    df_ticker["avg_trades"] = df_ticker["trades_shifted"].rolling(window=5).mean()
    df_ticker["std_trades"] = df_ticker["trades_shifted"].rolling(window=5).std()
    # Store the data in the lookup table
    for date, row in df_ticker.iterrows():
        # Use an ISO-format date string ("YYYY-MM-DD") as the key
        date_str = date.strftime("%Y-%m-%d")
        # Ensure rolling stats are available
        if pd.notnull(row["avg_trades"]) and pd.notnull(row["std_trades"]):
            lookup_table[ticker][date_str] = {
                "trades": row["trades"],
                "close_price": row["close_price"],
                "price_diff": row["price_diff"],
                "avg_trades": row["avg_trades"],
                "std_trades": row["std_trades"],
            }
        else:
            # Store data without rolling stats if not enough data points
            lookup_table[ticker][date_str] = {
                "trades": row["trades"],
                "close_price": row["close_price"],
                "price_diff": row["price_diff"],
                "avg_trades": None,
                "std_trades": None,
            }

print("Lookup table built successfully.")

# Convert defaultdict to a regular dict before pickling
lookup_table_dict = dict(lookup_table)

# Save the lookup table to a file for later use
with open("lookup_table.pkl", "wb") as f:  # type: BinaryIO
    pickle.dump(lookup_table_dict, f)

print("Lookup table saved to 'lookup_table.pkl'.")
```
`gui-lookup-table.py` (new file, +302 lines)

```python
import http.server
import json
import pickle
import socketserver
from urllib.parse import parse_qs, urlparse

from polygon import RESTClient
from polygon.rest.models import Agg

PORT = 8888

# Load the lookup_table
with open("lookup_table.pkl", "rb") as f:
    lookup_table = pickle.load(f)


class Handler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        # Parse the path and query parameters
        parsed_path = urlparse(self.path)
        path = parsed_path.path
        query_params = parse_qs(parsed_path.query)

        if path == "/":
            # Handle the root path
            # Get the date parameter if provided
            date_param = query_params.get("date", [None])[0]

            # Get all dates from the lookup table
            all_dates = set()
            for ticker_data in lookup_table.values():
                all_dates.update(ticker_data.keys())
            all_dates = sorted(all_dates)

            # If date is None, get the latest date from the lookup table
            if date_param is None:
                if all_dates:
                    latest_date = max(all_dates)
                else:
                    self.send_response(200)
                    self.send_header("Content-type", "text/html")
                    self.end_headers()
                    html_content = (
                        "<html><body><h1>No data available.</h1></body></html>"
                    )
                    self.wfile.write(html_content.encode())
                    return
            else:
                latest_date = date_param

            # Ensure latest_date is in all_dates
            if latest_date not in all_dates:
                # Handle the case where the provided date is invalid
                self.send_response(400)
                self.send_header("Content-type", "text/html")
                self.end_headers()
                error_html = f"<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>"
                self.wfile.write(error_html.encode())
                return

            # Now, get the anomalies for the latest_date
            anomalies = []
            for ticker, date_data in lookup_table.items():
                if latest_date in date_data:
                    data = date_data[latest_date]
                    trades = data["trades"]
                    avg_trades = data["avg_trades"]
                    std_trades = data["std_trades"]
                    if (
                        avg_trades is not None
                        and std_trades is not None
                        and std_trades > 0
                    ):
                        z_score = (trades - avg_trades) / std_trades
                        threshold_multiplier = 3  # Adjust as needed
                        if z_score > threshold_multiplier:
                            anomalies.append(
                                {
                                    "ticker": ticker,
                                    "date": latest_date,
                                    "trades": trades,
                                    "avg_trades": avg_trades,
                                    "std_trades": std_trades,
                                    "z_score": z_score,
                                    "close_price": data["close_price"],
                                    "price_diff": data["price_diff"],
                                }
                            )
            # Sort anomalies by trades in descending order
            anomalies.sort(key=lambda x: x["trades"], reverse=True)
            # Generate the HTML to display the anomalies
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            # Build the HTML content (Bootstrap for styling, Tablesort for sortable columns)
            html_content = '<html><head><title>Anomalies for {}</title><link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/tablesort/5.2.1/tablesort.min.js" integrity="sha512-F/gIMdDfda6OD2rnzt/Iyp2V9JLHlFQ+EUyixDg9+rkwjqgW1snpkpx7FD5FV1+gG2fmFj7I3r6ReQDUidHelA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/tablesort/5.2.1/sorts/tablesort.number.min.js" integrity="sha512-dRD755QRxlybm0h3LXXIGrFcjNakuxW3reZqnPtUkMv6YsSWoJf+slPjY5v4lZvx2ss+wBZQFegepmA7a2W9eA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script></head><body>'.format(
                latest_date
            )
            html_content += '<div id="container" style="padding:4px;"><h1>Anomalies for {}</h1>'.format(
                latest_date
            )
            # Add navigation links (prev and next dates)
            current_index = all_dates.index(latest_date)
            prev_date = all_dates[current_index - 1] if current_index > 0 else None
            next_date = (
                all_dates[current_index + 1]
                if current_index < len(all_dates) - 1
                else None
            )
            html_content += "<p>"
            if prev_date:
                html_content += '<a href="/?date={}">Previous Date</a> '.format(
                    prev_date
                )
            if next_date:
                html_content += '<a href="/?date={}">Next Date</a> '.format(next_date)
            html_content += "</p>"
            # Display the anomalies in a table
            html_content += (
                '<table id="anomalies" class="table table-striped table-hover">'
            )
            html_content += "<thead><tr>"
            html_content += "<th>Ticker</th>"
            html_content += "<th>Trades</th>"
            html_content += "<th>Avg Trades</th>"
            html_content += "<th>Std Dev</th>"
            html_content += "<th>Z-score</th>"
            html_content += "<th>Close Price</th>"
            html_content += "<th>Price Diff</th>"
            html_content += "<th>Chart</th>"
            html_content += "</tr></thead><tbody>"
            for anomaly in anomalies:
                html_content += "<tr>"
                html_content += "<td>{}</td>".format(anomaly["ticker"])
                html_content += "<td>{}</td>".format(anomaly["trades"])
                html_content += "<td>{:.2f}</td>".format(anomaly["avg_trades"])
                html_content += "<td>{:.2f}</td>".format(anomaly["std_trades"])
                html_content += "<td>{:.2f}</td>".format(anomaly["z_score"])
                html_content += "<td>{:.2f}</td>".format(anomaly["close_price"])
                html_content += "<td>{:.2f}</td>".format(anomaly["price_diff"])
                # Add a link to the chart
                html_content += (
                    '<td><a href="/chart?ticker={}&date={}">View Chart</a></td>'.format(
                        anomaly["ticker"], latest_date
                    )
                )
                html_content += "</tr>"
            html_content += '</tbody></table><script>new Tablesort(document.getElementById("anomalies"));</script>'
            html_content += "</div></body></html>"
            self.wfile.write(html_content.encode())
        elif path == "/chart":
            # Handle the chart page
            # Get 'ticker' and 'date' from query parameters
            ticker = query_params.get("ticker", [None])[0]
            date = query_params.get("date", [None])[0]
            if ticker is None or date is None:
                # Return an error page
                self.send_response(400)
                self.send_header("Content-type", "text/html")
                self.end_headers()
                error_html = "<html><body><h1>Error: Missing ticker or date parameter</h1></body></html>"
                self.wfile.write(error_html.encode())
            else:
                # Fetch minute aggregates for the ticker and date
                client = RESTClient(
                    trace=True
                )  # POLYGON_API_KEY environment variable is used
                try:
                    aggs = []
                    date_from = date
                    date_to = date
                    for a in client.list_aggs(
                        ticker,
                        1,
                        "minute",
                        date_from,
                        date_to,
                        limit=50000,
                    ):
                        aggs.append(a)
                    # Prepare data for the chart
                    data = []
                    for agg in aggs:
                        if isinstance(agg, Agg) and isinstance(agg.timestamp, int):
                            new_record = [
                                agg.timestamp,
                                agg.open,
                                agg.high,
                                agg.low,
                                agg.close,
                            ]
                            data.append(new_record)
                    # Generate the HTML for the chart page
                    chart_html = """
                    <!DOCTYPE HTML>
                    <html>
                    <head>
                    <style>
                    #container {
                        height: 750px;
                        min-width: 310px;
                    }
                    </style>
                    <script src="https://code.highcharts.com/stock/highstock.js"></script>
                    <script src="https://code.highcharts.com/stock/modules/data.js"></script>
                    <script src="https://code.highcharts.com/stock/modules/exporting.js"></script>
                    <script src="https://code.highcharts.com/stock/modules/accessibility.js"></script>
                    <script src="https://code.highcharts.com/moment/moment.js"></script>
                    <script src="https://code.highcharts.com/moment-timezone/moment-timezone.js"></script>
                    </head>
                    <body>
                    <div id="container">
                    <script type="text/javascript">
                    Highcharts.setOptions({
                        global: {
                            timezone: 'America/New_York'
                        }
                    });
                    var data = %s;
                    Highcharts.stockChart('container', {
                        exporting: {
                            url: 'http://localhost:7801', // Set your local server as the exporting server
                            enabled: true // Make sure exporting is enabled
                        },
                        rangeSelector: {
                            enabled: false,
                            selected: 1
                        },
                        navigator: {
                            //enabled: false
                        },
                        scrollbar: {
                            //enabled: false
                        },
                        xAxis: {
                            labels: {
                                //enabled: true // This hides the time labels under the chart
                            }
                        },
                        title: {
                            text: '%s Price Data on %s'
                        },
                        series: [{
                            type: 'candlestick',
                            name: '%s',
                            data: data,
                            color: 'red', // Color for downward movement
                            lineColor: 'red', // Line color for downward movement
                            upColor: 'green', // Color for upward movement
                            upLineColor: 'green', // Line color for upward movement
                            dataGrouping: {
                                units: [[
                                    'minute',
                                    [1]
                                ]]
                            }
                        }]
                    });
                    </script>
                    </div>
                    </body>
                    </html>
                    """ % (
                        json.dumps(data),
                        ticker,
                        date,
                        ticker,
                    )
                    self.send_response(200)
                    self.send_header("Content-type", "text/html")
                    self.send_header("Access-Control-Allow-Origin", "*")
                    self.end_headers()
                    self.wfile.write(chart_html.encode())
                except Exception as e:
                    # Handle exceptions
                    self.send_response(500)
                    self.send_header("Content-type", "text/html")
                    self.end_headers()
                    error_html = "<html><body><h1>Error fetching data: {}</h1></body></html>".format(
                        str(e)
                    )
                    self.wfile.write(error_html.encode())
        else:
            # Serve files from the current directory
            super().do_GET()


def run_server():
    with socketserver.TCPServer(("", PORT), Handler) as httpd:
        print("serving at port", PORT)
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            print("\nExiting gracefully...")
            httpd.shutdown()
            httpd.server_close()


if __name__ == "__main__":
    run_server()
```
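With the server running (`python gui-lookup-table.py`), the two routes implemented above can be exercised directly from another shell; the ticker and date values here are illustrative:
```bash
# Anomaly table for a specific date (defaults to the latest date if omitted)
curl "http://localhost:8888/?date=2024-10-18"
# Minute-bar candlestick chart page for one ticker on that date
curl "http://localhost:8888/chart?ticker=AAPL&date=2024-10-18"
```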
`query-lookup-table.py` (new file, +63 lines)

```python
import pickle
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Anomaly Detection Script")
parser.add_argument("date", type=str, help="Target date in YYYY-MM-DD format")
args = parser.parse_args()

# Load the lookup_table
with open("lookup_table.pkl", "rb") as f:
    lookup_table = pickle.load(f)

# Threshold for considering an anomaly (e.g., 3 standard deviations)
threshold_multiplier = 3

# Date for which we want to find anomalies
target_date_str = args.date

# List to store anomalies
anomalies = []

# Iterate over all tickers in the lookup table
for ticker, date_data in lookup_table.items():
    if target_date_str in date_data:
        data = date_data[target_date_str]
        trades = data["trades"]
        avg_trades = data["avg_trades"]
        std_trades = data["std_trades"]
        if avg_trades is not None and std_trades is not None and std_trades > 0:
            z_score = (trades - avg_trades) / std_trades
            if z_score > threshold_multiplier:
                anomalies.append(
                    {
                        "ticker": ticker,
                        "date": target_date_str,
                        "trades": trades,
                        "avg_trades": avg_trades,
                        "std_trades": std_trades,
                        "z_score": z_score,
                        "close_price": data["close_price"],
                        "price_diff": data["price_diff"],
                    }
                )

# Sort anomalies by trades in descending order
anomalies.sort(key=lambda x: x["trades"], reverse=True)

# Print the anomalies with aligned columns
print(f"\nAnomalies Found for {target_date_str}:\n")
print(
    f"{'Ticker':<10}{'Trades':>10}{'Avg Trades':>15}{'Std Dev':>10}{'Z-score':>10}{'Close Price':>12}{'Price Diff':>12}"
)
print("-" * 79)  # matches the 79-character header row above
for anomaly in anomalies:
    print(
        f"{anomaly['ticker']:<10}"
        f"{anomaly['trades']:>10.0f}"
        f"{anomaly['avg_trades']:>15.2f}"
        f"{anomaly['std_trades']:>10.2f}"
        f"{anomaly['z_score']:>10.2f}"
        f"{anomaly['close_price']:>12.2f}"
        f"{anomaly['price_diff']:>12.2f}"
    )
```
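For intuition about the z-score test used in both the query script and the GUI, a worked example with invented numbers:
```python
# Invented numbers for illustration: a ticker averaged 1,000 trades/day over
# the prior five days (std dev 100) and suddenly prints 1,500 trades.
trades, avg_trades, std_trades = 1500, 1000.0, 100.0
z_score = (trades - avg_trades) / std_trades  # (1500 - 1000) / 100 = 5.0
print(z_score > 3)  # True -> flagged at the default threshold_multiplier of 3
```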
