ml5js · rodjuncode · Aug 4, 2020
diff --git a/train.py b/train.py
@@ -70,7 +70,7 @@
 # Each sentence is an array of words.
 final_sentences = []
 for file in listOfFiles:
-    text = open(file).read().lower().replace("\n", " ") # Remove lineabreaks
+    text = open(file,encoding="utf-8").read().lower().replace("\n", " ") # Remove linebreaks
 
     # Remove all the stop words before running the actual tokenization.
     # I think it's a little bit cleaner to do it here and may perform because
@@ -100,7 +100,7 @@
 model.wv.save_word2vec_format(output_text_file, binary=False)
 
 # Open up that text file and convert to JSON
-f = open(output_text_file)
+f = open(output_text_file,encoding="utf-8")  # re-read the file written by save_word2vec_format above, decoding it as UTF-8
 v = {"vectors": {}}
 for line in f:
     w, n = line.split(" ", 1)
@@ -109,5 +109,7 @@
 # Save to a JSON file
 # Could make this an optional argument to specify output file
-with open(output_text_file[:-4] + "json", "w") as out:
-    json.dump(v, out)
-
+# ensure_ascii=False writes non-ASCII words verbatim, so the output file must
+# be opened as UTF-8 explicitly instead of relying on the platform default.
+with open(output_text_file[:-4] + "json", "w", encoding="utf-8") as out:
+    json.dump(v, out, ensure_ascii=False)
+