Skip to content

Commit 37d25ff

Browse files
committed
added dataset registry and column compatibility analyzer
1 parent d0b5139 commit 37d25ff

File tree

9 files changed

+897
-1
lines changed

9 files changed

+897
-1
lines changed
Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
import { DatasetRegistry } from '../dataset-registry/dataset-registry';
2+
import { Column, ColumnType, Dataset } from '../dataset-registry/types';
3+
import { ColumnCompatibilityAnalyzer } from './column-compatibility-analyzer';
4+
import {
5+
NAME_EXACT_MATCH,
6+
NAME_PARTIAL_MATCH,
7+
SCHEMA_COMPATIBILITY_MATCH,
8+
TYPE_COMPATIBILITY_MATCH,
9+
} from './constants';
10+
11+
/**
 * Test-only subclass of ColumnCompatibilityAnalyzer.
 *
 * Each `test*` method forwards its arguments unchanged to the base-class
 * helper of the corresponding name and returns that helper's result, so the
 * specs can assert on the individual scoring steps.
 *
 * The helpers are reached through bracket notation (`this['name']`) rather
 * than dot access.
 * NOTE(review): presumably they are declared `private` on the base class and
 * bracket access is used to get past the visibility check — confirm against
 * column-compatibility-analyzer.ts.
 */
class TestableColumnCompatibilityAnalyzer extends ColumnCompatibilityAnalyzer {
  /**
   * Forwards to the base `getTypeCompatibilityScore`.
   *
   * @param sourceType - Data type of the source column.
   * @param targetType - Data type of the candidate column.
   * @returns The score produced by the base helper.
   */
  public testGetTypeCompatibilityScore(
    sourceType: ColumnType,
    targetType: ColumnType
  ): number {
    return this['getTypeCompatibilityScore'](sourceType, targetType);
  }

  /**
   * Forwards to the base `getNameSimilarityScore`.
   *
   * @param sourceName - Name of the source column.
   * @param targetName - Name of the candidate column.
   * @returns The score produced by the base helper.
   */
  public testGetNameSimilarityScore(
    sourceName: string,
    targetName: string
  ): number {
    return this['getNameSimilarityScore'](sourceName, targetName);
  }

  /**
   * Forwards to the base `getSchemaCompatibilityScore`.
   *
   * @param sourceColumn - Source column, with or without a `schema`.
   * @param targetColumn - Candidate column, with or without a `schema`.
   * @returns The score produced by the base helper.
   */
  public testGetSchemaCompatibilityScore(
    sourceColumn: Column,
    targetColumn: Column
  ): number {
    return this['getSchemaCompatibilityScore'](sourceColumn, targetColumn);
  }

  /**
   * Forwards to the base `normalizeColumnName`.
   *
   * @param name - Raw column name.
   * @returns The normalized name produced by the base helper.
   */
  public testNormalizeColumnName(name: string): string {
    return this['normalizeColumnName'](name);
  }

  /**
   * Forwards to the base `assessCompatibility`.
   *
   * The return type is left to inference so it always tracks whatever the
   * base helper returns.
   *
   * @param sourceColumn - Source column.
   * @param targetColumn - Candidate column.
   * @returns The assessment object produced by the base helper.
   */
  public testAssessCompatibility(sourceColumn: Column, targetColumn: Column) {
    return this['assessCompatibility'](sourceColumn, targetColumn);
  }
}
41+
42+
/**
 * Specs for ColumnCompatibilityAnalyzer.
 *
 * The first groups exercise the public API (`findCompatibleColumns`,
 * `doesJoinPathExist`) against a registry populated with small fixture
 * datasets. The final group drives the individual scoring helpers through
 * TestableColumnCompatibilityAnalyzer.
 */
describe('ColumnCompatibilityAnalyzer', () => {
  let compatibleAnalyzer: ColumnCompatibilityAnalyzer;
  let mockRegistry: DatasetRegistry;

  // Baseline fixtures registered before every test: two datasets, each with
  // one numeric id-like column and one string column.
  const mockDatasets: Dataset[] = [
    {
      id: 'dataset1',
      name: 'Dataset 1',
      columns: [
        {
          name: 'user_id',
          dataType: 'number',
          schema: { type: 'integer' },
        },
        {
          name: 'email',
          dataType: 'string',
          schema: { type: 'string', format: 'email' },
        },
      ],
    },
    {
      id: 'dataset2',
      name: 'Dataset 2',
      columns: [
        {
          name: 'userId',
          dataType: 'number',
          schema: { type: 'integer' },
        },
        {
          name: 'name',
          dataType: 'string',
          schema: { type: 'string' },
        },
      ],
    },
  ];

  // A fresh registry and analyzer per test keeps datasets registered by one
  // test (e.g. dataset3 / dataset4 below) from leaking into the next.
  beforeEach(() => {
    mockRegistry = new DatasetRegistry();
    for (const dataset of mockDatasets) {
      mockRegistry.registerDataset(dataset);
    }
    compatibleAnalyzer = new ColumnCompatibilityAnalyzer(mockRegistry);
  });

  describe('findCompatibleColumns', () => {
    it('should find compatible columns based on type, name, and schema', () => {
      const result = compatibleAnalyzer.findCompatibleColumns({
        sourceDatasetId: 'dataset1',
        sourceColumnName: 'user_id',
      });

      // Only dataset2.userId is expected to qualify for dataset1.user_id.
      expect(result).toHaveLength(1);
      expect(result[0].column.name).toBe('userId');
      expect(result[0].dataset.id).toBe('dataset2');
    });

    it('should throw error when source column not found', () => {
      expect(() =>
        compatibleAnalyzer.findCompatibleColumns({
          sourceDatasetId: 'dataset1',
          sourceColumnName: 'unique_column',
        })
      ).toThrow('Column unique_column not found in dataset dataset1');
    });
  });

  describe('doesJoinPathExist', () => {
    it('should return join path for compatible columns', () => {
      const joinPath = {
        sourceDatasetId: 'dataset1',
        sourceColumnName: 'user_id',
        destinationDatasetId: 'dataset2',
        destinationColumnName: 'userId',
      };

      // On success the analyzer is expected to hand back an equal join path.
      const result = compatibleAnalyzer.doesJoinPathExist(joinPath);
      expect(result).toEqual(joinPath);
    });

    it('should throw error for incompatible columns', () => {
      // email is a string column and userId a number column in the fixtures.
      const joinPath = {
        sourceDatasetId: 'dataset1',
        sourceColumnName: 'email',
        destinationDatasetId: 'dataset2',
        destinationColumnName: 'userId',
      };

      expect(() => compatibleAnalyzer.doesJoinPathExist(joinPath)).toThrow(
        'Columns are not compatible for joining'
      );
    });
  });

  describe('name similarity scoring', () => {
    it('should match exact names ignoring case and special characters', () => {
      const result = compatibleAnalyzer.findCompatibleColumns({
        sourceDatasetId: 'dataset1',
        sourceColumnName: 'user_id',
      });

      expect(result).toHaveLength(1);
      expect(result[0].column.name).toBe('userId');
    });
  });

  describe('schema compatibility', () => {
    // Adds a third dataset whose email column has the same dataType and
    // schema as dataset1.email.
    beforeEach(() => {
      mockRegistry.registerDataset({
        id: 'dataset3',
        name: 'Dataset 3',
        columns: [
          {
            name: 'email',
            dataType: 'string',
            schema: { type: 'string', format: 'email' },
          },
        ],
      });
    });

    it('should consider schema when scoring compatibility', () => {
      const result = compatibleAnalyzer.findCompatibleColumns({
        sourceDatasetId: 'dataset1',
        sourceColumnName: 'email',
      });

      // Two candidates are expected, with the dataset3 column listed first.
      expect(result).toHaveLength(2);
      expect(result[0].column.name).toBe('email');
      expect(result[0].dataset.id).toBe('dataset3');
    });

    it('should handle missing schema gracefully', () => {
      // The single column here deliberately omits `schema`.
      const noSchemaDataset: Dataset = {
        id: 'dataset4',
        name: 'Dataset 4',
        columns: [
          {
            name: 'id',
            dataType: 'number',
          },
        ],
      };
      mockRegistry.registerDataset(noSchemaDataset);

      const result = compatibleAnalyzer.findCompatibleColumns({
        sourceDatasetId: 'dataset4',
        sourceColumnName: 'id',
      });

      expect(result).toHaveLength(2);
    });
  });

  describe('ColumnCompatibilityAnalyzer PRIVATE METHODS', () => {
    let analyzer: TestableColumnCompatibilityAnalyzer;
    let registry: DatasetRegistry;

    // These specs call the helpers with inline columns, so the registry is
    // left empty.
    beforeEach(() => {
      registry = new DatasetRegistry();
      analyzer = new TestableColumnCompatibilityAnalyzer(registry);
    });

    describe('getTypeCompatibilityScore', () => {
      it('should return full score for matching types', () => {
        expect(analyzer.testGetTypeCompatibilityScore('string', 'string')).toBe(
          TYPE_COMPATIBILITY_MATCH
        );
        expect(analyzer.testGetTypeCompatibilityScore('number', 'number')).toBe(
          TYPE_COMPATIBILITY_MATCH
        );
      });

      it('should return 0 for different types', () => {
        expect(analyzer.testGetTypeCompatibilityScore('string', 'number')).toBe(
          0
        );
        expect(
          analyzer.testGetTypeCompatibilityScore('boolean', 'string')
        ).toBe(0);
      });
    });

    describe('getNameSimilarityScore', () => {
      it('should return exact match score for identical names', () => {
        expect(analyzer.testGetNameSimilarityScore('user_id', 'user_id')).toBe(
          NAME_EXACT_MATCH
        );
        expect(analyzer.testGetNameSimilarityScore('userId', 'userId')).toBe(
          NAME_EXACT_MATCH
        );
      });

      // Previously asserted inside the "partial match" test below, where the
      // expected NAME_EXACT_MATCH contradicted the test title.
      it('should return exact match score for names differing only by case or separators', () => {
        expect(analyzer.testGetNameSimilarityScore('user_id', 'userId')).toBe(
          NAME_EXACT_MATCH
        );
      });

      it('should return partial match score for similar names', () => {
        expect(analyzer.testGetNameSimilarityScore('customer_id', 'id')).toBe(
          NAME_PARTIAL_MATCH
        );
      });

      it('should return 0 for different names', () => {
        expect(
          analyzer.testGetNameSimilarityScore('user_id', 'product_name')
        ).toBe(0);
      });
    });

    describe('getSchemaCompatibilityScore', () => {
      it('should return full score for matching schemas', () => {
        // Both columns share the very same schema object.
        const schema1 = { type: 'string', length: 255 };
        const column1: Column = {
          name: 'test1',
          dataType: 'string',
          schema: schema1,
        };
        const column2: Column = {
          name: 'test2',
          dataType: 'string',
          schema: schema1,
        };

        expect(analyzer.testGetSchemaCompatibilityScore(column1, column2)).toBe(
          SCHEMA_COMPATIBILITY_MATCH
        );
      });

      it('should return 0 for different schemas', () => {
        // Same schema type, different length.
        const column1: Column = {
          name: 'test1',
          dataType: 'string',
          schema: { type: 'string', length: 255 },
        };
        const column2: Column = {
          name: 'test2',
          dataType: 'string',
          schema: { type: 'string', length: 100 },
        };

        expect(analyzer.testGetSchemaCompatibilityScore(column1, column2)).toBe(
          0
        );
      });

      it('should return 0 when schemas are missing', () => {
        const column1: Column = { name: 'test1', dataType: 'string' };
        const column2: Column = { name: 'test2', dataType: 'string' };

        expect(analyzer.testGetSchemaCompatibilityScore(column1, column2)).toBe(
          0
        );
      });
    });

    describe('normalizeColumnName', () => {
      it('should convert to lowercase and remove special characters', () => {
        expect(analyzer.testNormalizeColumnName('User_ID')).toBe('userid');
        expect(analyzer.testNormalizeColumnName('customer-id')).toBe(
          'customerid'
        );
        expect(analyzer.testNormalizeColumnName('ProductName')).toBe(
          'productname'
        );
      });
    });

    describe('assessCompatibility', () => {
      it('should calculate total compatibility score correctly', () => {
        // Identical name, dataType and (structurally equal) schema, so every
        // component is expected to score its full match value.
        const column1: Column = {
          name: 'user_id',
          dataType: 'string',
          schema: { type: 'string', length: 255 },
        };
        const column2: Column = {
          name: 'user_id',
          dataType: 'string',
          schema: { type: 'string', length: 255 },
        };

        const result = analyzer.testAssessCompatibility(column1, column2);

        expect(result).toEqual({
          typeScore: TYPE_COMPATIBILITY_MATCH,
          nameScore: NAME_EXACT_MATCH,
          schemaScore: SCHEMA_COMPATIBILITY_MATCH,
          totalScore:
            TYPE_COMPATIBILITY_MATCH +
            NAME_EXACT_MATCH +
            SCHEMA_COMPATIBILITY_MATCH,
        });
      });

      it('should return default score when types do not match', () => {
        // Names match exactly, yet the total is still expected to be 0
        // because the data types differ.
        const column1: Column = { name: 'test1', dataType: 'string' };
        const column2: Column = { name: 'test1', dataType: 'number' };

        const result = analyzer.testAssessCompatibility(column1, column2);

        expect(result.totalScore).toBe(0);
      });
    });
  });
});

0 commit comments

Comments
 (0)