diff --git a/data/omniglot/preprocess.sh b/data/omniglot/preprocess.sh
new file mode 100755
index 00000000..61e3c374
--- /dev/null
+++ b/data/omniglot/preprocess.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -x
+
+# download data and convert to .json format
+if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then
+    pushd preprocess
+    ./data_to_json.sh
+    popd
+fi
+
+NAME="omniglot" # name of the dataset, equivalent to directory name
+
+cd ../utils
+
+./preprocess.sh --name "$NAME" "$@"
+
+cd "../$NAME"
diff --git a/data/omniglot/preprocess/data_to_json.py b/data/omniglot/preprocess/data_to_json.py
new file mode 100644
index 00000000..a6e85da2
--- /dev/null
+++ b/data/omniglot/preprocess/data_to_json.py
@@ -0,0 +1,45 @@
+import os
+import json
+import glob
+import numpy as np
+
+from PIL import Image
+from collections import defaultdict
+
+image_size = (28, 28)
+status_update_after = 5000  # images processed
+
+user_class = dict()
+user_data = defaultdict(dict)
+
+parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+image_paths = os.path.join(parent_path, 'data', 'raw_data', 'images_*', '*', '*', '*.png')
+
+for i, character_file in enumerate(glob.glob(image_paths)):
+    character_class = '.'.join(character_file.split('/')[-4:-1])
+    user_id = character_file.split('/')[-1].split('_')[0]
+    # instance_num = character_file.split('/')[-1].split('_')[1].split('.')[0]
+
+    img = Image.open(character_file).resize(image_size, resample=Image.LANCZOS)
+    flattened_img = np.array(img.convert('L')).flatten() / 255.
+
+    if user_id not in user_class:
+        user_class[user_id] = character_class
+        user_data[user_id]['x'] = list()
+        user_data[user_id]['y'] = list()
+    user_data[user_id]['x'].append(flattened_img.tolist())
+    user_data[user_id]['y'].append(user_id)
+
+    if (i + 1) % status_update_after == 0:
+        print("{} images converted".format(i + 1))
+
+all_data = dict()
+all_data['users'] = list(user_class.keys())
+all_data['num_samples'] = [len(user_data[x]['x']) for x in all_data['users']]
+all_data['user_data'] = user_data
+
+file_name = 'all_data.json'
+file_path = os.path.join(parent_path, 'data', 'all_data', file_name)
+
+with open(file_path, 'w') as outfile:
+    json.dump(all_data, outfile)
diff --git a/data/omniglot/preprocess/data_to_json.sh b/data/omniglot/preprocess/data_to_json.sh
new file mode 100755
index 00000000..85d80bb5
--- /dev/null
+++ b/data/omniglot/preprocess/data_to_json.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Setup data and raw_data directories, if they don't already exist
+if [ ! -d "../data/raw_data" ]; then
+    mkdir -p ../data/raw_data
+fi
+
+# Check and download data if needed
+./get_data.sh
+
+if [ ! -d "../data/all_data" ] || [ ! "$(ls -A ../data/all_data)" ]; then
+    mkdir -p ../data/all_data
+    echo "------------------------------"
+    echo "converting data to .json format"
+    python3 data_to_json.py
+    echo "finished converting data to .json format"
+fi
diff --git a/data/omniglot/preprocess/get_data.sh b/data/omniglot/preprocess/get_data.sh
new file mode 100755
index 00000000..30c03d4a
--- /dev/null
+++ b/data/omniglot/preprocess/get_data.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_URL="https://raw.githubusercontent.com/brendenlake/omniglot/master/python/"
+declare -a data_folders=( "images_background" "images_evaluation" )
+
+pushd "../data/raw_data"
+    echo "------------------------------"
+    for data_folder in "${data_folders[@]}"; do
+        if [ ! -d "${data_folder}" ]; then
+            echo "Downloading ${data_folder}"
+            wget --no-check-certificate "${DOWNLOAD_URL}${data_folder}.zip"
+            unzip "${data_folder}.zip"
+            rm "${data_folder}.zip"
+        else
+            echo "Found Omniglot image directory ${data_folder}"
+        fi
+    done
+popd