Skip to content

add encoding support for CSV files #767

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions documentation/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- `title`: Name of the app displayed in the interface.
- `version`: Version of the app.
- `port`: Port the app runs on (default is `8501`).
- `disable_reactivity`: (optional) Set to true to disable Preswalds reactive runtime. When disabled, Preswald will rerun the entire script on every update instead of selectively recomputing affected parts using its dependency graph (DAG). This can be useful for debugging, performance benchmarking, or in environments where reactivity fallback is expected.
- `disable_reactivity`: (optional) Set to true to disable Preswald's reactive runtime. When disabled, Preswald will rerun the entire script on every update instead of selectively recomputing affected parts using its dependency graph (DAG). This can be useful for debugging, performance benchmarking, or in environments where reactivity fallback is expected.

### `[branding]`

Expand All @@ -65,6 +65,10 @@ You can use a local or remote CSV file as a data source by defining it in `presw

- `type`: Use `"csv"`.
- `path`: Relative or absolute path to the CSV file, or a URL pointing to a remote one.
- `encoding` (optional): Character encoding of the CSV file. Defaults to `"utf-8"`. Common values:
- `"utf-8"`: Standard UTF-8 encoding (default)
- `"latin-1"`: ISO-8859-1 encoding (also known as Latin-1)
- `"utf-16"`: UTF-16 encoding

#### Example CSV Connections:

Expand All @@ -76,6 +80,11 @@ path = "data/customers.csv"
[data.sample_csv]
type = "csv"
path = "https://storage.googleapis.com/test/sample_data.csv"

[data.latin1_csv]
type = "csv"
path = "data/legacy_data.csv"
encoding = "latin-1" # For ISO-8859-1 encoded files
```

If the CSV file is located in a subdirectory, make sure the `path` is correct relative to the root directory.
Expand Down Expand Up @@ -236,4 +245,4 @@ To disable telemetry data collection, add this to your `preswald.toml`:
enabled = false # Disables all telemetry data collection
```

If the `[telemetry]` section is not present in your configuration, telemetry will be enabled by default to help improve Preswald.
If the `[telemetry]` section is not present in your configuration, telemetry will be enabled by default to help improve Preswald.
11 changes: 8 additions & 3 deletions preswald/engine/managers/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class PostgresConfig:
@dataclass
class CSVConfig:
path: str
encoding: str = "utf-8" # default to utf-8 for backward compatibility


@dataclass
Expand Down Expand Up @@ -152,7 +153,8 @@ def __init__(
ignore_errors=true,
normalize_names=false,
sample_size=-1,
all_varchar=true
all_varchar=true,
encoding='{config.encoding}'
)
""")

Expand Down Expand Up @@ -428,7 +430,10 @@ def connect(self): # noqa: C901

try:
if source_type == "csv":
cfg = CSVConfig(path=source_config["path"])
cfg = CSVConfig(
path=source_config["path"],
encoding=source_config.get("encoding", "utf-8")
)
self.sources[name] = CSVSource(name, cfg, self.duckdb_conn)

elif source_type == "json":
Expand Down Expand Up @@ -520,7 +525,7 @@ def _get_or_create_source(self, source_name: str) -> DataSource:
# check if source_name is a valid file path
if os.path.exists(source_name):
if source_name.endswith(".csv"):
cfg = CSVConfig(path=source_name)
cfg = CSVConfig(path=source_name, encoding="utf-8")
self.sources[source_name] = CSVSource(
source_name, cfg, self.duckdb_conn
)
Expand Down
1 change: 1 addition & 0 deletions preswald/tutorial/preswald.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ primaryColor = "#000000"
[data.sample_csv]
type = "csv"
path = "data/sample.csv"
# encoding = "latin-1" # Uncomment and set to "latin-1" for ISO-8859-1 encoded files

[logging]
level = "INFO" # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
Expand Down
130 changes: 130 additions & 0 deletions simple_encoding_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Simple test script to verify that DuckDB's CSV encoding support works correctly.
This script creates a test CSV file with ISO-8859-1 encoding and tests loading it.
"""

import os
import tempfile
import duckdb

def create_test_csv_with_latin1():
    """Create a temporary CSV file encoded as ISO-8859-1 (Latin-1).

    The file contains accented characters that are valid Latin-1 but are
    not valid UTF-8 byte sequences, so reading it as UTF-8 should fail.

    Returns:
        str: Path to the temporary file. The caller is responsible for
        deleting it.
    """
    rows = [
        "name,value,description",
        "José,123,áéíóú",
        "François,456,ñç",
        "Müller,789,ßäöü",
    ]
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, encoding="latin-1"
    ) as handle:
        handle.write("\n".join(rows) + "\n")
        return handle.name

def test_utf8_encoding_fails():
    """Try to read the Latin-1 test file with UTF-8 and report the outcome.

    Because ``ignore_errors=true`` is passed to DuckDB, the read may still
    succeed by skipping undecodable rows instead of raising, so both
    outcomes are printed rather than asserted.
    """
    temp_file = create_test_csv_with_latin1()
    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        conn.execute(f"""
            SELECT * FROM read_csv_auto('{temp_file}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true,
                encoding='utf-8'
            )
        """).df()

        print("✓ UTF-8 encoding test completed")

    except Exception as e:
        print(f"✓ UTF-8 encoding failed as expected: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

def test_latin1_encoding_succeeds():
    """Read the Latin-1 test file with the matching encoding and print it.

    Loading with encoding='latin-1' should decode all rows, including the
    accented characters, without data loss.
    """
    temp_file = create_test_csv_with_latin1()
    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        result = conn.execute(f"""
            SELECT * FROM read_csv_auto('{temp_file}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true,
                encoding='latin-1'
            )
        """).df()

        print(f"✓ Latin-1 encoding succeeded! Loaded {len(result)} rows")
        print(f"  Columns: {list(result.columns)}")
        print("  Sample data:")
        for _, row in result.iterrows():
            print(f"    {row['name']}, {row['value']}, {row['description']}")

    except Exception as e:
        print(f"✗ Latin-1 encoding failed: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

def test_default_encoding():
    """Verify that a plain UTF-8 CSV loads with no explicit encoding.

    This guards backward compatibility: files that worked before the
    encoding option was added must keep working when none is given.
    """
    # Create a regular UTF-8 CSV file containing only ASCII-safe rows.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f:
        f.write("name,value\n")
        f.write("John,100\n")
        f.write("Jane,200\n")
        temp_file = f.name

    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        result = conn.execute(f"""
            SELECT * FROM read_csv_auto('{temp_file}',
                header=true,
                auto_detect=true,
                ignore_errors=true,
                normalize_names=false,
                sample_size=-1,
                all_varchar=true
            )
        """).df()

        print(f"✓ Default encoding succeeded! Loaded {len(result)} rows")
        print("  Sample data:")
        for _, row in result.iterrows():
            print(f"    {row['name']}, {row['value']}")

    except Exception as e:
        print(f"✗ Default encoding failed: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

if __name__ == "__main__":
    # Run the three encoding checks in order, separated by blank lines.
    print("Testing DuckDB CSV encoding support...")
    print("=" * 50)

    for check in (
        test_default_encoding,
        test_utf8_encoding_fails,
        test_latin1_encoding_succeeds,
    ):
        check()
        print()

    print("=" * 50)
    print("Test completed!")
108 changes: 108 additions & 0 deletions test_encoding_fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
Test script to verify that the CSV encoding fix works correctly.
This script creates a test CSV file with ISO-8859-1 encoding and tests loading it.
"""

import os
import tempfile
import pandas as pd
from preswald.engine.managers.data import CSVConfig, CSVSource
import duckdb

def create_test_csv_with_latin1():
    """Write a temporary ISO-8859-1 (Latin-1) encoded CSV to disk.

    The rows contain accented characters that cannot be decoded as UTF-8,
    which lets the callers exercise encoding mismatches.

    Returns:
        str: Path of the temporary file; the caller must remove it.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, encoding="latin-1"
    ) as out:
        out.writelines(
            [
                "name,value,description\n",
                "José,123,áéíóú\n",
                "François,456,ñç\n",
                "Müller,789,ßäöü\n",
            ]
        )
        path = out.name
    return path

def test_utf8_encoding_fails():
    """Load the Latin-1 file through CSVSource with UTF-8 and report the outcome.

    CSVSource may still load the file despite the encoding mismatch
    (DuckDB is configured to ignore errors), so both outcomes are printed
    rather than asserted.
    """
    temp_file = create_test_csv_with_latin1()
    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        config = CSVConfig(path=temp_file, encoding="utf-8")
        CSVSource("test_csv", config, conn)

        # If we get here, it means the file loaded successfully with UTF-8
        # which might happen if the file doesn't contain problematic characters
        print("✓ UTF-8 encoding test completed")

    except Exception as e:
        print(f"✓ UTF-8 encoding failed as expected: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

def test_latin1_encoding_succeeds():
    """Load the Latin-1 file through CSVSource with the matching encoding.

    With encoding='latin-1' the source should decode every row, and
    to_df() should return all the data including accented characters.
    """
    temp_file = create_test_csv_with_latin1()
    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        config = CSVConfig(path=temp_file, encoding="latin-1")
        source = CSVSource("test_csv", config, conn)

        # Try to query the data
        df = source.to_df()
        print(f"✓ Latin-1 encoding succeeded! Loaded {len(df)} rows")
        print(f"  Columns: {list(df.columns)}")
        print(f"  Sample data: {df.head().to_dict()}")

    except Exception as e:
        print(f"✗ Latin-1 encoding failed: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

def test_default_encoding():
    """Verify CSVConfig defaults to UTF-8 when no encoding is given.

    This guards backward compatibility: configs written before the
    encoding field existed must keep loading regular UTF-8 files.
    """
    # Create a regular UTF-8 CSV file containing only ASCII-safe rows.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f:
        f.write("name,value\n")
        f.write("John,100\n")
        f.write("Jane,200\n")
        temp_file = f.name

    # Bind conn before entering the try block: if duckdb.connect() raised
    # inside try, the finally clause would hit a NameError on conn.close().
    conn = None
    try:
        conn = duckdb.connect(':memory:')
        config = CSVConfig(path=temp_file)  # No encoding specified, should default to utf-8
        source = CSVSource("test_csv", config, conn)

        # Try to query the data
        df = source.to_df()
        print(f"✓ Default encoding succeeded! Loaded {len(df)} rows")
        print(f"  Sample data: {df.head().to_dict()}")

    except Exception as e:
        print(f"✗ Default encoding failed: {e}")
    finally:
        if conn is not None:
            conn.close()
        os.unlink(temp_file)

if __name__ == "__main__":
    # Run the three encoding checks in order, separated by blank lines.
    print("Testing CSV encoding fix...")
    print("=" * 50)

    for check in (
        test_default_encoding,
        test_utf8_encoding_fails,
        test_latin1_encoding_succeeds,
    ):
        check()
        print()

    print("=" * 50)
    print("Test completed!")