Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions code/upload_to_hugging_face/update_descriptions/list_datasets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Map of Hugging Face dataset repo id -> metadata YAML file name.
#
# Shaped as the <dataset_yaml> argument of update_hf_description.py, which
# loads this mapping and, for every entry, fetches
#   https://raw.githubusercontent.com/VIDA-NYU/OSCUR-data/main/metadata/<file name>
# to read the dataset description.
#
# Entries that are commented out are skipped by the YAML parser.
oscur/automated-traffic-volume-counts-sample: automated-traffic-volume-counts-sample.yaml
oscur/automated-traffic-volume-counts: automated-traffic-volume-counts.yaml
oscur/NYC_vehicle_collisions_issue: NYC_vehicle_collisions_issue.yaml
# oscur/NYC_vehicle_collisions: NYC_vehicle_collisions.yaml
oscur/pluto: pluto.yaml
# oscur/NYC_311: NYC_311.yaml
oscur/taxisvis1M: taxisvis1M.yaml
oscur/NYC_raised_crosswalk: NYC_raised_crosswalk.yaml
# oscur/NYC_speed_humps: speed_humps.yaml
oscur/NYC_on_street_curb_management: NYC_on_street_curb_management.yaml
oscur/NYC_signals_markings_signs: NYC_signals_markings_signs.yaml
oscur/NYC_transit_stop_accessibility: NYC_transit_stop_accessibility.yaml
oscur/NYC_sidewalk_surface_condition: NYC_sidewalk_surface_condition.yaml
# NOTE(review): the disabled entry below points at NYC_vehicle_collisions.yaml,
# which looks like a copy-paste slip - confirm the file name before enabling it.
# oscur/NYC_facilities: NYC_vehicle_collisions.yaml
oscur/NYC_issued_licenses: NYC_issued_licenses.yaml
oscur/NYC_truck_routes: NYC_truck_routes.yaml
oscur/NYC_open_space: NYC_open_space.yaml
oscur/NYC_parks: NYC_parks.yaml
oscur/NYC_heat_vulnerability: NYC_heat_vulnerability.yaml
oscur/NYC_fixed_obstructions_on_sidewalks: NYC_fixed_obstructions_on_sidewalks.yaml
oscur/NYC_motor_vehicle_crashes: NYC_motor_vehicle_crashes.yaml
oscur/NYC_motor_vehicle_persons: NYC_motor_vehicle_persons.yaml
oscur/NYC_tree_cover_landscaping: NYC_tree_cover_landscaping.yaml
oscur/NYC_subway_stations: NYC_subway_stations.yaml
oscur/NYC_intercity_bus_stops: NYC_intercity_bus_stops.yaml
oscur/NYC_rail_routes_and_crossings: NYC_rail_routes_and_crossings.yaml
oscur/NYC_urban_design_and_frontage: NYC_urban_design_and_frontage.yaml
oscur/NYC_public_open_spaces: NYC_public_open_spaces.yaml
# oscur/NYC_bicycle_facilities: bicycle_facilities.yaml
oscur/NYC_ferry_ridership: NYC_ferry_ridership.yaml
oscur/NYC_1M_mta_bus_hourly: NYC_1M_mta_bus_hourly.yaml
oscur/NYC_1M_mta_subway_hourly: NYC_1M_mta_subway_hourly.yaml
oscur/NYC_NYCHA_residential_sites: NYC_NYCHA_residential_sites.yaml
oscur/NYC_sidewalks_with_land_use: NYC_sidewalks_with_land_use.yaml
oscur/NYC_historic_land_use: NYC_historic_land_use.yaml
# oscur/NYC_curb_infrastructure: curb_infrastructure.yaml
oscur/NYC_speed_distributions: NYC_speed_distributions.yaml
oscur/NYC_Bi_Annual_Pedestrian_Counts: NYC_Bi_Annual_Pedestrian_Counts.yaml
# oscur/NYC_bicycle_counts: bicycle_pedestrian_trip_counts.yaml
oscur/NYC_pedestrian_intervals: NYC_pedestrian_intervals.yaml
oscur/NYC_community_health_survey: NYC_community_health_survey.yaml
oscur/NYC_roadway_features: NYC_roadway_features.yaml
oscur/NYC_socioeconomic_and_demographic: NYC_socioeconomic_and_demographic.yaml
oscur/NYC_final_disadvantaged_communities: NYC_final_disadvantaged_communities.yaml
oscur/NYC_housing_density: NYC_housing_density.yaml
oscur/NYC_vehicle_volume_and_types: NYC_vehicle_volume_and_types.yaml
oscur/NYC_topography: NYC_topography.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Update the Hugging Face dataset-card description for a hand-picked set of
# datasets.
#
# Usage:
#   bash run_update_hf_descriptions.sh <hf_token>
#
# update_hf_description.py takes exactly two arguments,
# "<dataset_yaml> <hf_token>", where <dataset_yaml> is a
# "repo_id: metadata_file.yaml" map. The pairs selected below are therefore
# written to a temporary map file and handed over in a single call.

set -euo pipefail

# ${1:-} keeps `set -u` from aborting before the usage message can be shown.
HF_TOKEN="${1:-}"

if [ -z "$HF_TOKEN" ]; then
    echo "Usage: bash run_update_hf_descriptions.sh <hf_token>" >&2
    exit 1
fi

# Dataset repo id -> metadata file name. The updater resolves each name under
# https://raw.githubusercontent.com/VIDA-NYU/OSCUR-data/main/metadata/
declare -A DATASETS
# NOTE(review): this repo id ends in "_issue" but the metadata file does not -
# confirm that NYC_vehicle_collisions.yaml is the intended source.
DATASETS["oscur/NYC_vehicle_collisions_issue"]="NYC_vehicle_collisions.yaml"
DATASETS["oscur/NYC_ferry_ridership"]="NYC_ferry_ridership.yaml"

# Temporary map file, removed on exit whether or not the update succeeds.
map_file="$(mktemp)"
trap 'rm -f "$map_file"' EXIT

for repo_id in "${!DATASETS[@]}"; do
    echo "➡️ Updating $repo_id ..."
    printf '%s: %s\n' "$repo_id" "${DATASETS[$repo_id]}" >> "$map_file"
done

# Run the Python updater once over the whole map.
python3 update_hf_description.py "$map_file" "$HF_TOKEN"
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
update_hf_description.py

Utility functions to fetch dataset descriptions from YAML metadata files
and update the README.md (dataset card) on the Hugging Face Hub.
"""

import sys
import yaml
import requests
from huggingface_hub import HfApi, HfFileSystem
from pathlib import Path
import io

def get_description_from_yaml(yaml_url: str) -> str:
    """Fetch a YAML metadata file and return its 'description' field.

    Args:
        yaml_url: URL of the YAML metadata file to download.

    Returns:
        The description with surrounding whitespace stripped. A placeholder
        sentence is returned instead when the document has no usable
        'description' entry, or when the download or the YAML parse fails
        (the failure is also printed).
    """
    missing = "No description found in YAML file."

    try:
        # A timeout keeps one unresponsive host from hanging the whole batch.
        response = requests.get(yaml_url, timeout=30)
        response.raise_for_status()
        metadata = yaml.safe_load(response.text)
    except (requests.RequestException, yaml.YAMLError) as e:
        print(f"⚠️ Error fetching {yaml_url}: {e}")
        return "No description available (fetch error)."

    # An empty document parses to None, and a list or scalar has no .get().
    if not isinstance(metadata, dict):
        return missing

    description = metadata.get("description", missing)
    # A blank `description:` key parses to None; only text can be stripped.
    if not isinstance(description, str):
        return missing
    return description.strip()

def _split_front_matter(content: str):
    """Split README text into (front-matter mapping, Markdown body).

    Returns an empty mapping and the untouched text when the README has no
    complete '---' delimited front matter, or when that block does not parse
    to a mapping.
    """
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            metadata = yaml.safe_load(parts[1]) or {}
            # Only a mapping can take a 'description' key; anything else is
            # left in the body so no existing text is dropped.
            if isinstance(metadata, dict):
                return metadata, parts[2].lstrip()
    return {}, content


def update_dataset_description(
    repo_id: str,
    new_description: str,
    token: str,
    *,
    dry_run: bool = True,
) -> None:
    """Set the 'description' field in a dataset README.md on the Hugging Face Hub.

    The current README.md is read from the Hub, its YAML front matter is
    parsed, 'description' is added or overwritten, and the README is rebuilt
    with the Markdown body kept as it was.

    Args:
        repo_id: Dataset repository id, e.g. "oscur/pluto".
        new_description: Text to store under the 'description' key.
        token: Hugging Face access token used to read and upload the README.
        dry_run: When True (the default) the rebuilt README is not uploaded
            and only a summary is printed. Pass False to commit the new
            README.md to the Hub.
    """
    # Path to README in the dataset repo
    readme_path = f"datasets/{repo_id}/README.md"

    # --- Fetch existing README.md content ---
    fs = HfFileSystem(token=token)
    try:
        with fs.open(readme_path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        # No dataset card yet: start from an empty README.
        content = ""

    # --- Update description in YAML metadata ---
    metadata, markdown_body = _split_front_matter(content)
    metadata["description"] = new_description

    # --- Rebuild the README.md ---
    new_yaml = yaml.safe_dump(metadata, sort_keys=False, allow_unicode=True)
    updated_readme = f"---\n{new_yaml}---\n\n{markdown_body}"

    print(f"\n📘 Processing {repo_id}")
    print(f" Description length: {len(new_description)} chars")

    if dry_run:
        # Nothing was written, so do not report the description as updated.
        print(f"ℹ️ Dry run: README.md for {repo_id} was not uploaded")
        return

    # --- Upload the new README.md back to the Hub ---
    HfApi().upload_file(
        path_or_fileobj=io.BytesIO(updated_readme.encode("utf-8")),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        commit_message="Update dataset description programmatically",
    )
    print(f"✅ Updated description for {repo_id}")

if __name__ == "__main__":
    # Command-line entry point:
    #   python update_hf_description.py <dataset_yaml> <hf_token>
    # <dataset_yaml> is a YAML map of "repo_id: metadata_file.yaml" pairs.
    if len(sys.argv) != 3:
        print("Usage: python update_hf_description.py <dataset_yaml> <hf_token>")
        sys.exit(1)

    dataset_yaml = Path(sys.argv[1])
    hf_token = sys.argv[2]

    with open(dataset_yaml, "r", encoding="utf-8") as f:
        # A file holding only comments parses to None: treat it as empty.
        dataset_map = yaml.safe_load(f) or {}

    if not isinstance(dataset_map, dict):
        print(f"⚠️ {dataset_yaml} must contain 'repo_id: yaml_file' pairs")
        sys.exit(1)

    # Placeholder text get_description_from_yaml returns when the download
    # fails; it must never be written into a dataset card.
    fetch_error_placeholder = "No description available (fetch error)."

    for repo_id, yaml_file in dataset_map.items():
        yaml_url = f"https://raw.githubusercontent.com/VIDA-NYU/OSCUR-data/main/metadata/{yaml_file}"
        desc = get_description_from_yaml(yaml_url)
        if desc == fetch_error_placeholder:
            print(f"⏭️ Skipping {repo_id}: metadata could not be fetched")
            continue
        update_dataset_description(repo_id, desc, hf_token)

# if __name__ == "__main__":
# # Allow command-line execution
# if len(sys.argv) != 4:
# print("Usage: python update_hf_description.py <repo_id> <yaml_url> <hf_token>")
# sys.exit(1)

# repo_id, yaml_url, token = sys.argv[1:]
# desc = get_description_from_yaml(yaml_url)
# update_dataset_description(repo_id, desc, token)