From 4db9d1d9d471cb0d404fddc78e6ceab6088ad6f2 Mon Sep 17 00:00:00 2001 From: Manan Soni Date: Wed, 21 Jun 2023 00:31:33 +0530 Subject: [PATCH 001/219] improving code coverage issue #615 --- .../zingg/common/core/model/TestModel.java | 66 ++++++++ .../common/core/sink/TestTableOutput.java | 45 ++++++ .../java/zingg/hash/TestHashFnFromConf.java | 25 +++ .../java/zingg/hash/TestHashFunction.java | 150 ++++++++++++++++++ .../java/zingg/hash/TestIdentityLong.java | 27 ++++ .../zingg/hash/TestLessThanZeroFloat.java | 40 +++++ .../java/zingg/hash/TestLessThanZeroLong.java | 40 +++++ .../hash/TestRangeBetween0And10Float.java | 72 +++++++++ .../hash/TestRangeBetween100And1000Long.java | 71 +++++++++ .../zingg/hash/TestTrimLastDigitsFloat.java | 45 ++++++ .../zingg/hash/TestTrimLastDigitsLong.java | 39 +++++ 11 files changed, 620 insertions(+) create mode 100644 common/core/src/test/java/zingg/common/core/model/TestModel.java create mode 100644 common/core/src/test/java/zingg/common/core/sink/TestTableOutput.java create mode 100644 common/core/src/test/java/zingg/hash/TestHashFnFromConf.java create mode 100644 common/core/src/test/java/zingg/hash/TestHashFunction.java create mode 100644 common/core/src/test/java/zingg/hash/TestIdentityLong.java create mode 100644 common/core/src/test/java/zingg/hash/TestLessThanZeroFloat.java create mode 100644 common/core/src/test/java/zingg/hash/TestLessThanZeroLong.java create mode 100644 common/core/src/test/java/zingg/hash/TestRangeBetween0And10Float.java create mode 100644 common/core/src/test/java/zingg/hash/TestRangeBetween100And1000Long.java create mode 100644 common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java create mode 100644 common/core/src/test/java/zingg/hash/TestTrimLastDigitsLong.java diff --git a/common/core/src/test/java/zingg/common/core/model/TestModel.java b/common/core/src/test/java/zingg/common/core/model/TestModel.java new file mode 100644 index 000000000..ebb3122d4 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/TestModel.java @@ -0,0 +1,66 @@ +package zingg.common.core.model; + +import zingg.common.client.ZFrame; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.model.Model; + +import java.io.IOException; + +public class TestModel { + @Test + public void testGetGrid() { + Model model = getInstance(); + double[] result = model.getGrid(1.0, 10.0, 2.0, false); + double[] expected = {1.0, 3.0, 5.0, 7.0, 9.0}; + assertArrayEquals(expected, result, 0.0); + } + + @Test + public void testGetGridForMultiples() { + Model model = getInstance(); + double[] result = model.getGrid(1.0, 10.0, 2.0, true); + double[] expected = {1.0, 2.0, 4.0, 8.0}; + assertArrayEquals(expected, result, 0.0); + } + + private Model getInstance() { + Model model = new Model() { + @Override + public void register(Object spark) { + } + + @Override + public void fit(ZFrame pos, ZFrame neg) { + } + + @Override + public void load(String path) { + } + + @Override + public ZFrame predict(ZFrame data) { + return null; + } + + @Override + public ZFrame predict(ZFrame data, boolean isDrop) { + return null; + } + + @Override + public void save(String path) throws IOException { + } + + @Override + public ZFrame transform(ZFrame input) { + return null; + } + }; + return model; + } + +} diff --git a/common/core/src/test/java/zingg/common/core/sink/TestTableOutput.java b/common/core/src/test/java/zingg/common/core/sink/TestTableOutput.java new file mode 100644 index 
000000000..97f2fc34e --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/sink/TestTableOutput.java @@ -0,0 +1,45 @@ +package zingg.common.core.sink; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.sink.TableOutput; + + +public class TestTableOutput { + + private TableOutput getInstance() { + return new TableOutput(3, 234456L, 87654L, "Company X"); + } + + @Test + public void testGetMethods() { + String ans = "Company X"; + TableOutput value = getInstance(); + assertEquals(3, value.getJobId()); + assertEquals(234456L, value.getTimestamp()); + assertEquals(87654L, value.getClusterId()); + assertEquals(ans, value.getRecord()); + } + + @Test + public void testSetMethods() { + TableOutput value = getInstance(); + int newJobId = 5; + long newTimestamp = 778899L; + long newClusterId = 9876L; + String newRecord = "Company Y"; + + value.setJobId(newJobId); + value.setTimestamp(newTimestamp); + value.setClusterId(newClusterId); + value.setRecord(newRecord); + + assertEquals(5, value.getJobId()); + assertEquals(778899L, value.getTimestamp()); + assertEquals(9876L, value.getClusterId()); + assertEquals(newRecord, value.getRecord()); + } + +} diff --git a/common/core/src/test/java/zingg/hash/TestHashFnFromConf.java b/common/core/src/test/java/zingg/hash/TestHashFnFromConf.java new file mode 100644 index 000000000..c99179c5a --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestHashFnFromConf.java @@ -0,0 +1,25 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.HashFnFromConf; + +@JsonInclude(Include.NON_NULL) +public class TestHashFnFromConf { + @Test + public void testHashFnFromConf() { + HashFnFromConf hashFnFromConf = new HashFnFromConf(); + hashFnFromConf.setName("Micheal"); + assertEquals("Micheal", hashFnFromConf.getName()); + } + + @Test + public void testHashFnFromConf1() { + HashFnFromConf hashFnFromConf = new HashFnFromConf(); + hashFnFromConf.setName(null); + assertEquals(null, hashFnFromConf.getName()); + } +} \ No newline at end of file diff --git a/common/core/src/test/java/zingg/hash/TestHashFunction.java b/common/core/src/test/java/zingg/hash/TestHashFunction.java new file mode 100644 index 000000000..1e46142a8 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestHashFunction.java @@ -0,0 +1,150 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.client.ZFrame; +import zingg.common.core.hash.HashFunction; + + +public class TestHashFunction { + @Test + public void testGetName() { + HashFunction hashFunction = new HashFunction("initialName") { + @Override + public ZFrame apply(ZFrame ds, String column, String newColumn) { + return null; + } + + @Override + public Object getAs(Integer integer, String column) { + return null; + } + + @Override + public Object getAs(String s, Integer integer, String column) { + return null; + } + + @Override + public Object apply(Integer integer, String column) { + return null; + } + + @Override + public Object apply(String s, Integer integer, String column) { + return null; + } + }; + + String expectedName = "hashFunction"; + hashFunction.setName(expectedName); + assertEquals(expectedName, hashFunction.getName()); + } + @Test + public 
void testGetReturnType() { + HashFunction hashFunction = new HashFunction("Name", 999L, 888L) { + @Override + public ZFrame apply(ZFrame ds, String column, String newColumn) { + return null; + } + + @Override + public Object getAs(Integer integer, String column) { + return null; + } + + @Override + public Object getAs(String s, Integer integer, String column) { + return null; + } + + @Override + public Object apply(Integer integer, String column) { + return null; + } + + @Override + public Object apply(String s, Integer integer, String column) { + return null; + } + }; + + long returnType = 9999L; + hashFunction.setReturnType(returnType); + assertEquals(returnType, hashFunction.getReturnType()); + + long dataType = 888L; + hashFunction.setDataType(dataType); + assertEquals(dataType, hashFunction.getDataType()); + } + + @Test + public void testIsUdf() { + HashFunction hashFunction = new HashFunction("Name", 999L, 888L, true) { + @Override + public ZFrame apply(ZFrame ds, String column, String newColumn) { + return null; + } + + @Override + public Object getAs(Integer integer, String column) { + return null; + } + + @Override + public Object getAs(String s, Integer integer, String column) { + return null; + } + + @Override + public Object apply(Integer integer, String column) { + return null; + } + + @Override + public Object apply(String s, Integer integer, String column) { + return null; + } + }; + + Boolean isUdf = false; + hashFunction.setUdf(isUdf); + assertEquals(false, hashFunction.isUdf()); + } + + @Test + public void testGetAs() { + HashFunction hashFunction = new HashFunction() { + @Override + public ZFrame apply(ZFrame ds, String column, String newColumn) { + return null; + } + + @Override + public Object getAs(Integer integer, String column) { + return null; + } + + @Override + public Object getAs(String s, Integer integer, String column) { + return null; + } + + @Override + public Object apply(Integer integer, String column) { + return null; + } + + @Override + public Object apply(String s, Integer integer, String column) { + return null; + } + }; + Integer value = 10; + String column = "inputColumn"; + assertEquals(null, hashFunction.getAs(value, column)); + } + +} diff --git a/common/core/src/test/java/zingg/hash/TestIdentityLong.java b/common/core/src/test/java/zingg/hash/TestIdentityLong.java new file mode 100644 index 000000000..dab7404a9 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestIdentityLong.java @@ -0,0 +1,27 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.IdentityLong; + +public class TestIdentityLong { + + @Test + public void testIdentityLong() { + IdentityLong value = getInstance(); + assertEquals(12345L, value.call(12345L)); + } + + @Test + public void testIdentityLong1() { + IdentityLong value = getInstance(); + assertEquals(null, value.call(null)); + } + + private IdentityLong getInstance() { + return new IdentityLong(); + } + +} diff --git a/common/core/src/test/java/zingg/hash/TestLessThanZeroFloat.java b/common/core/src/test/java/zingg/hash/TestLessThanZeroFloat.java new file mode 100644 index 000000000..63bdb5bf3 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestLessThanZeroFloat.java @@ -0,0 +1,40 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +import 
zingg.common.core.hash.LessThanZeroFloat; + +public class TestLessThanZeroFloat { + + @Test + public void testLessThanZeroFloatForValueZero() { + LessThanZeroFloat value = getInstance(); + assertFalse(value.call(0.0f)); + } + + @Test + public void testLessThanZeroFloatForValueNull() { + LessThanZeroFloat value = getInstance(); + assertFalse(value.call(null)); + } + + @Test + public void testLessThanZeroFloatNegativeValue() { + LessThanZeroFloat value = getInstance(); + assertTrue(value.call(-5435.45f)); + } + + @Test + public void testLessThanZeroFloatPositiveValue() { + LessThanZeroFloat value = getInstance(); + assertFalse(value.call(876.457f)); + } + + private LessThanZeroFloat getInstance() { + LessThanZeroFloat value = new LessThanZeroFloat(); + return value; + } +} diff --git a/common/core/src/test/java/zingg/hash/TestLessThanZeroLong.java b/common/core/src/test/java/zingg/hash/TestLessThanZeroLong.java new file mode 100644 index 000000000..44d161752 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestLessThanZeroLong.java @@ -0,0 +1,40 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.LessThanZeroLong; + +public class TestLessThanZeroLong { + + @Test + public void testLessThanZeroLongForValueZero() { + LessThanZeroLong value = getInstance(); + assertFalse(value.call(0L)); + } + + @Test + public void testLessThanZeroLongForValueNull() { + LessThanZeroLong value = getInstance(); + assertFalse(value.call(null)); + } + + @Test + public void testLessThanZeroLongNegativeValue() { + LessThanZeroLong value = getInstance(); + assertTrue(value.call(-543545L)); + } + + @Test + public void testLessThanZeroLongPositiveValue() { + LessThanZeroLong value = getInstance(); + assertFalse(value.call(876457L)); + } + + private LessThanZeroLong getInstance() { + LessThanZeroLong value = new LessThanZeroLong(); + return value; + } +} diff --git a/common/core/src/test/java/zingg/hash/TestRangeBetween0And10Float.java b/common/core/src/test/java/zingg/hash/TestRangeBetween0And10Float.java new file mode 100644 index 000000000..2b2dfe246 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestRangeBetween0And10Float.java @@ -0,0 +1,72 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.RangeFloat; + +public class TestRangeBetween0And10Float { + + private RangeFloat getInstance() { + return new RangeFloat(0,10); + } + + @Test + public void testRangeForValueZero() { + RangeFloat value = getInstance(); + assertEquals(1, value.call(0f)); + } + + @Test + public void testRangeForNegativeValue() { + Float input = -100f; + RangeFloat value = getInstance(); + assertEquals(0, value.call(input)); + } + + @Test + public void testRangeForVeryHighValue() { + Float input = 99999f; + RangeFloat value = getInstance(); + assertEquals(0, value.call(input)); + } + + @Test + public void testRangeForValue8() { + RangeFloat value = getInstance(); + assertEquals(1, value.call(8f)); + } + + @Test + public void testRangeForValue65() { + RangeFloat value = getInstance(); + assertEquals(0, value.call(65f)); + } + + @Test + public void testRangeForValue867() { + RangeFloat value = getInstance(); + assertEquals(0, value.call(867f)); + } + @Test + public void testRangeForValue8637() { + RangeFloat value = getInstance(); + assertEquals(0, 
value.call(8637f)); + } + @Test + public void testRangeForNull() { + RangeFloat value = getInstance(); + assertEquals(0, value.call(null)); + } + @Test + public void testRangeForUpperLimit() { + RangeFloat value = getInstance(); + assertEquals(10, value.getUpperLimit()); + } + @Test + public void testRangeForLowerLimit() { + RangeFloat value = getInstance(); + assertEquals(0, value.getLowerLimit()); + } +} diff --git a/common/core/src/test/java/zingg/hash/TestRangeBetween100And1000Long.java b/common/core/src/test/java/zingg/hash/TestRangeBetween100And1000Long.java new file mode 100644 index 000000000..4c3b9cfea --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestRangeBetween100And1000Long.java @@ -0,0 +1,71 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.RangeLong; + +public class TestRangeBetween100And1000Long { + + private RangeLong getInstance() { + return new RangeLong(100L,1000L); + } + + @Test + public void testRangeForValueZero() { + RangeLong value = getInstance(); + assertEquals(0, value.call(0L)); + } + + @Test + public void testRangeForNegativeValue() { + RangeLong value = getInstance(); + assertEquals(0, value.call(-100L)); + } + + @Test + public void testRangeForVeryHighValue() { + RangeLong value = getInstance(); + assertEquals(0, value.call(999999L)); + } + + @Test + public void testRangeForValue8() { + RangeLong value = getInstance(); + assertEquals(0, value.call(8L)); + } + + @Test + public void testRangeForValue65() { + RangeLong value = getInstance(); + assertEquals(0, value.call(65L)); + } + + @Test + public void testRangeForValue867() { + RangeLong value = getInstance(); + assertEquals(1, value.call(867L)); + } + @Test + public void testRangeForValue8637() { + RangeLong value = getInstance(); + assertEquals(0, value.call(8637L)); + } + @Test + public void testRangeForNull() { + RangeLong value = getInstance(); + assertEquals(0, value.call(null)); + } + @Test + public void testRangeForUpperLimit() { + RangeLong value = getInstance(); + assertEquals(1000, value.getUpperLimit()); + } + @Test + public void testRangeForLowerLimit() { + RangeLong value = getInstance(); + assertEquals(100, value.getLowerLimit()); + } + +} diff --git a/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java new file mode 100644 index 000000000..3f9e9ef26 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java @@ -0,0 +1,45 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.TrimLastDigitsFloat; + +public class TestTrimLastDigitsFloat { + + @Test + public void testTrimLast1DigitFloat() { + TrimLastDigitsFloat value = getInstance(1); + assertEquals(54353f, value.call(543534.677f)); + } + + @Test + public void testTrimLast2DigitsFloat() { + TrimLastDigitsFloat value = getInstance(2); + assertEquals(5435f, value.call(543534.677f)); + } + + @Test + public void testTrimLast3DigitsFloat() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(543f, value.call(543534.677f)); + } + + @Test + public void testTrimLast3DigitsFloatNaNValue() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(Float.NaN, value.call(Float.NaN)); + } + + @Test + public void testTrimLast3DigitsFloatNullValue() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(null, 
value.call(null)); + } + + private TrimLastDigitsFloat getInstance(int num) { + return new TrimLastDigitsFloat(num); + } + +} diff --git a/common/core/src/test/java/zingg/hash/TestTrimLastDigitsLong.java b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsLong.java new file mode 100644 index 000000000..a8aefc628 --- /dev/null +++ b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsLong.java @@ -0,0 +1,39 @@ +package zingg.hash; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import zingg.common.core.hash.TrimLastDigitsLong; + +public class TestTrimLastDigitsLong { + + @Test + public void testTrimLast1Digit() { + TrimLastDigitsLong value = getInstance(1); + assertEquals(54353L, value.call(543534L)); + } + + @Test + public void testTrimLast2DigitsInt() { + TrimLastDigitsLong value = getInstance(2); + assertEquals(5435L, value.call(543534L)); + } + + @Test + public void testTrimLast3DigitsInt() { + TrimLastDigitsLong value = getInstance(3); + assertEquals(543L, value.call(543534L)); + } + + @Test + public void testTrimLast3DigitsIntNullValue() { + TrimLastDigitsLong value = getInstance(3); + assertEquals(null, value.call(null)); + } + + private TrimLastDigitsLong getInstance(int num) { + return new TrimLastDigitsLong(num); + } + +} \ No newline at end of file From 91d038941e0b9418f428e2e98355d3e6004e3906 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Tue, 22 Aug 2023 20:37:41 +0530 Subject: [PATCH 002/219] client restruc for enterprise --- .../main/java/zingg/common/client/Client.java | 30 +++++++++++++++---- .../java/zingg/spark/client/SparkClient.java | 15 +++++----- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 664364683..643c6cae7 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -24,6 +24,8 @@ public abstract class Client implements Serializable { public static final Log LOG = LogFactory.getLog(Client.class); + protected String zFactoryClassName; + /** * Construct a client to Zingg using provided arguments and spark master. 
@@ -35,10 +37,14 @@ public abstract class Client implements Serializable { * if issue connecting to master */ - public Client() {} + public Client(String zFactory) { + setZFactoryClassName(zFactory); + } - public Client(Arguments args, ClientOptions options) throws ZinggClientException { + public Client(Arguments args, ClientOptions options, String zFactory) throws ZinggClientException { + setZFactoryClassName(zFactory); this.options = options; + try { buildAndSetArguments(args, options); printAnalyticsBanner(arguments.getCollectMetrics()); @@ -50,14 +56,28 @@ public Client(Arguments args, ClientOptions options) throws ZinggClientException } } - public Client(Arguments args, ClientOptions options, S s) throws ZinggClientException { - this(args, options); + + public String getZFactoryClassName() { + return zFactoryClassName; + } + + public void setZFactoryClassName(String s) { + this.zFactoryClassName = s; + } + + public Client(Arguments args, ClientOptions options, S s, String zFactory) throws ZinggClientException { + this(args, options, zFactory); this.session = s; LOG.debug("Session passed is " + s); if (session != null) zingg.setSession(session); } - public abstract IZinggFactory getZinggFactory() throws Exception;//(IZinggFactory) Class.forName("zingg.ZFactory").newInstance(); + + public IZinggFactory getZinggFactory() throws InstantiationException, IllegalAccessException, ClassNotFoundException{ + LOG.debug("z factory is " + getZFactoryClassName()); + return (IZinggFactory) Class.forName(getZFactoryClassName()).newInstance(); + } + diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index fda95945f..c083e0612 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -24,14 +24,16 @@ public class SparkClient extends Client, Row, Column, DataType> implements Serializable { private static final long serialVersionUID = 1L; + protected static final String zFactoryClassName = "zingg.spark.core.executor.SparkZFactory"; public SparkClient(Arguments args, ClientOptions options) throws ZinggClientException { - super(args, options); - + super(args, options, zFactoryClassName); } + + public SparkClient(Arguments args, ClientOptions options, ZSparkSession s) throws ZinggClientException { - super(args, options, s); + super(args, options, s, zFactoryClassName); } public SparkClient(Arguments args, ClientOptions options, SparkSession s) throws ZinggClientException { @@ -45,15 +47,12 @@ public SparkClient() { .getOrCreate(); JavaSparkContext ctx = JavaSparkContext.fromSparkContext(session.sparkContext()); JavaSparkContext.jarOfClass(IZingg.class); + */ + super(zFactoryClassName); } - @Override - public IZinggFactory getZinggFactory() throws InstantiationException, IllegalAccessException, ClassNotFoundException{ - return (IZinggFactory) Class.forName("zingg.spark.core.executor.SparkZFactory").newInstance(); - } - @Override public Client, Row, Column, DataType> getClient(Arguments args, From c5cb6e3ca998ba1790076183e5a6390011217798 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 23 Aug 2023 13:24:49 +0530 Subject: [PATCH 003/219] rm dep on zinggoptions --- .../src/main/java/zingg/common/client/IZingg.java | 2 +- .../java/zingg/common/core/executor/Documenter.java | 2 +- .../zingg/common/core/executor/FindAndLabeller.java | 2 +- .../common/core/executor/LabelDataViewHelper.java | 4 ++-- 
.../java/zingg/common/core/executor/LabelUpdater.java | 2 +- .../main/java/zingg/common/core/executor/Labeller.java | 6 +++--- .../main/java/zingg/common/core/executor/Linker.java | 2 +- .../main/java/zingg/common/core/executor/Matcher.java | 2 +- .../java/zingg/common/core/executor/Recommender.java | 2 +- .../java/zingg/common/core/executor/TrainMatcher.java | 2 +- .../zingg/common/core/executor/TrainingDataFinder.java | 2 +- .../zingg/common/core/executor/TrainingDataModel.java | 4 ++-- .../java/zingg/common/core/executor/ZinggBase.java | 10 ++++++---- .../src/main/java/zingg/spark/client/SparkClient.java | 2 +- .../zingg/spark/core/executor/SparkDocumenter.java | 2 +- .../spark/core/executor/SparkFindAndLabeller.java | 2 +- .../zingg/spark/core/executor/SparkLabelUpdater.java | 2 +- .../java/zingg/spark/core/executor/SparkLabeller.java | 2 +- .../java/zingg/spark/core/executor/SparkLinker.java | 2 +- .../java/zingg/spark/core/executor/SparkMatcher.java | 2 +- .../java/zingg/spark/core/executor/SparkPeekModel.java | 2 +- .../zingg/spark/core/executor/SparkRecommender.java | 2 +- .../zingg/spark/core/executor/SparkTrainMatcher.java | 2 +- .../java/zingg/spark/core/executor/SparkTrainer.java | 2 +- .../spark/core/executor/SparkTrainingDataFinder.java | 2 +- 25 files changed, 34 insertions(+), 32 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/IZingg.java b/common/client/src/main/java/zingg/common/client/IZingg.java index 306f4bec8..ec675ec02 100644 --- a/common/client/src/main/java/zingg/common/client/IZingg.java +++ b/common/client/src/main/java/zingg/common/client/IZingg.java @@ -11,7 +11,7 @@ public void init(Arguments args, IZinggLicense license) public void cleanup() throws ZinggClientException; - public ZinggOptions getZinggOptions(); + //public ZinggOptions getZinggOptions(); public String getName(); diff --git a/common/core/src/main/java/zingg/common/core/executor/Documenter.java b/common/core/src/main/java/zingg/common/core/executor/Documenter.java index 6e80b8aa7..f0bc2e21d 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Documenter.java +++ b/common/core/src/main/java/zingg/common/core/executor/Documenter.java @@ -14,7 +14,7 @@ public abstract class Documenter extends ZinggBase { public static final Log LOG = LogFactory.getLog(Documenter.class); public Documenter() { - setZinggOptions(ZinggOptions.GENERATE_DOCS); + //setZinggOptions(ZinggOptions.GENERATE_DOCS); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index ef21600e3..26d0c1461 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -17,7 +17,7 @@ public abstract class FindAndLabeller extends ZinggBase labeller; public FindAndLabeller() { - setZinggOptions(ZinggOptions.FIND_AND_LABEL); + //setZinggOptions(ZinggOptions.FIND_AND_LABEL); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index 14273ba2c..9ac6bac78 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -21,9 +21,9 @@ public class LabelDataViewHelper extends ZinggBase imp private static 
final long serialVersionUID = 1L; public static final Log LOG = LogFactory.getLog(LabelDataViewHelper.class); - public LabelDataViewHelper(Context context, ZinggOptions zinggOptions, ClientOptions clientOptions) { + public LabelDataViewHelper(Context context, ClientOptions clientOptions) { setContext(context); - setZinggOptions(zinggOptions); + //setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java index 0615819d4..92ba76953 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java @@ -19,7 +19,7 @@ public abstract class LabelUpdater extends Labeller { public static final Log LOG = LogFactory.getLog(LabelUpdater.class); public LabelUpdater() { - setZinggOptions(ZinggOptions.UPDATE_LABEL); + //setZinggOptions(ZinggOptions.UPDATE_LABEL); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Labeller.java b/common/core/src/main/java/zingg/common/core/executor/Labeller.java index 7c9575c25..1240be03a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Labeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/Labeller.java @@ -24,7 +24,7 @@ public abstract class Labeller extends ZinggBase { protected ILabelDataViewHelper labelDataViewHelper; public Labeller() { - setZinggOptions(ZinggOptions.LABEL); + //setZinggOptions(ZinggOptions.LABEL); } public void execute() throws ZinggClientException { @@ -158,7 +158,7 @@ int readCliInput() { @Override public ITrainingDataModel getTrainingDataModel() { if (trainingDataModel==null) { - this.trainingDataModel = new TrainingDataModel(getContext(), getZinggOptions(), getClientOptions()); + this.trainingDataModel = new TrainingDataModel(getContext(), getClientOptions()); } return trainingDataModel; } @@ -170,7 +170,7 @@ public void setTrainingDataModel(ITrainingDataModel trainingDataMode @Override public ILabelDataViewHelper getLabelDataViewHelper() { if(labelDataViewHelper==null) { - labelDataViewHelper = new LabelDataViewHelper(getContext(), getZinggOptions(), getClientOptions()); + labelDataViewHelper = new LabelDataViewHelper(getContext(), getClientOptions()); } return labelDataViewHelper; } diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 797bb59bc..465a4b1f1 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -17,7 +17,7 @@ public abstract class Linker extends Matcher { public static final Log LOG = LogFactory.getLog(Linker.class); public Linker() { - setZinggOptions(ZinggOptions.LINK); + //setZinggOptions(ZinggOptions.LINK); } protected ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 2aae7fea2..41923d66c 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -25,7 +25,7 @@ public abstract class Matcher extends ZinggBase{ public static final Log LOG = LogFactory.getLog(Matcher.class); public 
Matcher() { - setZinggOptions(ZinggOptions.MATCH); + //setZinggOptions(ZinggOptions.MATCH); } protected ZFrame getTestData() throws ZinggClientException{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Recommender.java b/common/core/src/main/java/zingg/common/core/executor/Recommender.java index 7119a1182..2163b3c10 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Recommender.java +++ b/common/core/src/main/java/zingg/common/core/executor/Recommender.java @@ -13,7 +13,7 @@ public abstract class Recommender extends ZinggBase { public static final Log LOG = LogFactory.getLog(Recommender.class); public Recommender() { - setZinggOptions(ZinggOptions.RECOMMEND); + //setZinggOptions(ZinggOptions.RECOMMEND); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index 162c033f9..3d9e3cba9 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -17,7 +17,7 @@ public abstract class TrainMatcher extends ZinggBase{ protected Matcher matcher; public TrainMatcher() { - setZinggOptions(ZinggOptions.TRAIN_MATCH); + //setZinggOptions(ZinggOptions.TRAIN_MATCH); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index 50502afcf..c26fd0522 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -20,7 +20,7 @@ public abstract class TrainingDataFinder extends ZinggBase public static final Log LOG = LogFactory.getLog(TrainingDataFinder.class); public TrainingDataFinder() { - setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + //setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); } public ZFrame getTraining() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index 4e6424789..c4e145e09 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -21,9 +21,9 @@ public class TrainingDataModel extends ZinggBase imple private long positivePairsCount, negativePairsCount, notSurePairsCount; private long totalCount; - public TrainingDataModel(Context context, ZinggOptions zinggOptions, ClientOptions clientOptions) { + public TrainingDataModel(Context context, ClientOptions clientOptions) { setContext(context); - setZinggOptions(zinggOptions); + //setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index 45466dd77..118b2716d 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -34,7 +34,7 @@ public abstract class ZinggBase implements Serializable, IZingg context; protected String name; - protected ZinggOptions zinggOptions; + //protected ZinggOptions zinggOptions; protected long startTime; protected ClientOptions clientOptions; 
@@ -86,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); - Analytics.postEvent(zinggOptions.getValue(), collectMetrics); + //Analytics.postEvent(zinggOptions.getValue(), collectMetrics); } public Arguments getArgs() { @@ -110,17 +110,19 @@ public void setContext(Context source) { public void setName(String name) { this.name = name; } - public void setZinggOptions(ZinggOptions zinggOptions) { + /*public void setZinggOptions(ZinggOptions zinggOptions) { this.zinggOptions = zinggOptions; } + */ public String getName() { return name; } + /* public ZinggOptions getZinggOptions() { return zinggOptions; - } + }*/ public ZFrame getMarkedRecords() { try { diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index c083e0612..4b8e2ce53 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -21,7 +21,7 @@ * @author sgoyal * */ -public class SparkClient extends Client, Row, Column, DataType> implements Serializable { +public class SparkClient extends Client, Row, Column, DataType> { private static final long serialVersionUID = 1L; protected static final String zFactoryClassName = "zingg.spark.core.executor.SparkZFactory"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 7b3ab3ec1..7eaba6e7b 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -26,7 +26,7 @@ public class SparkDocumenter extends Documenter, Row public static final Log LOG = LogFactory.getLog(SparkDocumenter.class); public SparkDocumenter() { - setZinggOptions(ZinggOptions.GENERATE_DOCS); + //setZinggOptions(ZinggOptions.GENERATE_DOCS); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index e4e0db2b9..fa3a8e1c1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -22,7 +22,7 @@ public class SparkFindAndLabeller extends FindAndLabeller, public static final Log LOG = LogFactory.getLog(SparkLabelUpdater.class); public SparkLabelUpdater() { - setZinggOptions(ZinggOptions.UPDATE_LABEL); + //setZinggOptions(ZinggOptions.UPDATE_LABEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index fd38d2e64..20f99f49e 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -30,7 +30,7 @@ public SparkLabeller() { } public SparkLabeller(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.LABEL); + //setZinggOptions(ZinggOptions.LABEL); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 
dde7c3633..3f03865ea 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -25,7 +25,7 @@ public class SparkLinker extends Linker, Row, Column public static final Log LOG = LogFactory.getLog(SparkLinker.class); public SparkLinker() { - setZinggOptions(ZinggOptions.LINK); + //setZinggOptions(ZinggOptions.LINK); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index ee6b1145c..db944b091 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -35,7 +35,7 @@ public SparkMatcher() { } public SparkMatcher(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.MATCH); + //setZinggOptions(ZinggOptions.MATCH); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index e12876b04..5deeceab1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -26,7 +26,7 @@ public class SparkPeekModel extends ZinggBase, Row, public static final Log LOG = LogFactory.getLog(SparkPeekModel.class); public SparkPeekModel() { - setZinggOptions(ZinggOptions.PEEK_MODEL); + //setZinggOptions(ZinggOptions.PEEK_MODEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 541dcd297..bec8a3ca3 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -28,7 +28,7 @@ public class SparkRecommender extends Recommender, R public static final Log LOG = LogFactory.getLog(SparkRecommender.class); public SparkRecommender() { - setZinggOptions(ZinggOptions.RECOMMEND); + //setZinggOptions(ZinggOptions.RECOMMEND); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 86600939e..1a76d8f4d 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -21,7 +21,7 @@ public class SparkTrainMatcher extends TrainMatcher, public static final Log LOG = LogFactory.getLog(SparkTrainMatcher.class); public SparkTrainMatcher() { - setZinggOptions(ZinggOptions.TRAIN_MATCH); + //setZinggOptions(ZinggOptions.TRAIN_MATCH); ZinggSparkContext sparkContext = new ZinggSparkContext(); setContext(sparkContext); trainer = new SparkTrainer(sparkContext); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 6bad40892..a49c8f0af 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -28,7 +28,7 @@ public SparkTrainer() { } public SparkTrainer(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.TRAIN); + //setZinggOptions(ZinggOptions.TRAIN); 
setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index b2813d4f5..0cc86593e 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -27,7 +27,7 @@ public SparkTrainingDataFinder() { } public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + //setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); setContext(sparkContext); } From 59fb016ae6f8c76c9e941c7dce256f086e818761 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 26 Aug 2023 09:12:18 +0530 Subject: [PATCH 004/219] matcher gets invoked in enterprise --- .../zingg/common/core/executor/Linker.java | 6 +-- .../zingg/common/core/executor/Matcher.java | 33 ++++++------ .../java/zingg/spark/client/SparkClient.java | 9 ++-- .../zingg/spark/client/ZSparkSession.java | 39 -------------- .../documenter/SparkDataColDocumenter.java | 6 +-- .../core/documenter/SparkDataDocumenter.java | 6 +-- .../documenter/SparkModelColDocumenter.java | 6 +-- .../core/documenter/SparkModelDocumenter.java | 6 +-- .../spark/core/executor/SparkDocumenter.java | 8 +-- .../core/executor/SparkFindAndLabeller.java | 4 +- .../core/executor/SparkLabelUpdater.java | 4 +- .../spark/core/executor/SparkLabeller.java | 4 +- .../spark/core/executor/SparkLinker.java | 8 +-- .../spark/core/executor/SparkMatcher.java | 8 +-- .../spark/core/executor/SparkPeekModel.java | 4 +- .../spark/core/executor/SparkRecommender.java | 8 +-- .../core/executor/SparkTrainMatcher.java | 4 +- .../spark/core/executor/SparkTrainer.java | 6 +-- .../executor/SparkTrainingDataFinder.java | 6 +-- .../core/executor/ZinggSparkContext.java | 54 +++++++++---------- .../zingg/spark/core/model/SparkModel.java | 6 +-- .../core/model/VectorValueExtractor.java | 6 +-- .../preprocess/SparkStopWordsRemover.java | 10 ++-- .../SparkStopWordsRecommender.java | 6 +-- .../core/similarity/SparkBaseTransformer.java | 4 +- .../core/similarity/SparkTransformer.java | 6 +-- .../core/util/SparkBlockingTreeUtil.java | 10 ++-- .../zingg/spark/core/util/SparkDFReader.java | 8 +-- .../zingg/spark/core/util/SparkDSUtil.java | 14 ++--- .../zingg/spark/core/util/SparkHashUtil.java | 8 +-- .../zingg/spark/core/util/SparkModelUtil.java | 14 ++--- .../zingg/spark/core/util/SparkPipeUtil.java | 10 ++-- .../spark/core/executor/ZinggSparkTester.java | 6 +-- 33 files changed, 148 insertions(+), 189 deletions(-) delete mode 100644 spark/client/src/main/java/zingg/spark/client/ZSparkSession.java diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 465a4b1f1..76b8703d7 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -20,13 +20,13 @@ public Linker() { //setZinggOptions(ZinggOptions.LINK); } - protected ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ + public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ // THIS LOG IS NEEDED FOR PLAN CALCULATION USING COUNT, DO NOT REMOVE LOG.info("in getBlocks, blocked count is " + blocked.count()); return getDSUtil().joinWithItselfSourceSensitive(blocked, ColName.HASH_COL, args).cache(); } - protected ZFrame selectColsFromBlocked(ZFrame 
blocked) { + public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } @@ -53,7 +53,7 @@ public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws } } - protected ZFrame getDupesActualForGraph(ZFrame dupes) { + public ZFrame getDupesActualForGraph(ZFrame dupes) { ZFrame dupesActual = dupes .filter(dupes.equalTo(ColName.PREDICTION_COL, ColValues.IS_MATCH_PREDICTION)); return dupesActual; diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 41923d66c..99f6c05ff 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -28,17 +28,17 @@ public Matcher() { //setZinggOptions(ZinggOptions.MATCH); } - protected ZFrame getTestData() throws ZinggClientException{ + public ZFrame getTestData() throws ZinggClientException{ ZFrame data = getPipeUtil().read(true, true, args.getNumPartitions(), true, args.getData()); return data; } - protected ZFrame getFieldDefColumnsDS(ZFrame testDataOriginal) { + public ZFrame getFieldDefColumnsDS(ZFrame testDataOriginal) { return getDSUtil().getFieldDefColumnsDS(testDataOriginal, args, true); } - protected ZFrame getBlocked( ZFrame testData) throws Exception, ZinggClientException{ + public ZFrame getBlocked( ZFrame testData) throws Exception, ZinggClientException{ LOG.debug("Blocking model file location is " + args.getBlockFile()); Tree> tree = getBlockingTreeUtil().readBlockingTree(args); ZFrame blocked = getBlockingTreeUtil().getBlockHashes(testData, tree); @@ -48,11 +48,11 @@ protected ZFrame getBlocked( ZFrame testData) throws Exception, - protected ZFrame getBlocks(ZFrameblocked) throws Exception{ + public ZFrame getBlocks(ZFrameblocked) throws Exception{ return getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); } - protected ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ + public ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) .selectExpr("first.z_zid as z_zid", "second.z_zid as z_z_zid"); @@ -74,15 +74,15 @@ protected ZFrame getBlocks(ZFrameblocked, ZFramebAll) throw return joinH; } - protected ZFramemassageAllEquals(ZFrameallEqual) { + public ZFramemassageAllEquals(ZFrameallEqual) { allEqual = allEqual.withColumn(ColName.PREDICTION_COL, ColValues.IS_MATCH_PREDICTION); allEqual = allEqual.withColumn(ColName.SCORE_COL, ColValues.FULL_MATCH_SCORE); return allEqual; } - protected abstract Model getModel() throws ZinggClientException; + public abstract Model getModel() throws ZinggClientException; - protected ZFrameselectColsFromBlocked(ZFrameblocked) { + public ZFrameselectColsFromBlocked(ZFrameblocked) { return blocked.select(ColName.ID_COL, ColName.HASH_COL); } @@ -172,7 +172,8 @@ public void execute() throws ZinggClientException { throw new ZinggClientException(e.getMessage()); } } - protected ZFrame addObvDupes(ZFrame obvDupePairs, ZFrame dupesActual) { + + public ZFrame addObvDupes(ZFrame obvDupePairs, ZFrame dupesActual) { if (obvDupePairs != null) { // ensure same columns in both obvDupePairs = selectColsFromDupes(obvDupePairs); @@ -181,7 +182,7 @@ protected ZFrame addObvDupes(ZFrame obvDupePairs, ZFrame removeObvDupesFromBlocks(ZFrame blocks) { + public ZFrame removeObvDupesFromBlocks(ZFrame blocks) { LOG.info("blocks count 
before removing obvDupePairs " + blocks.count()); C reverseOBVDupeDFFilter = blocks.getReverseObviousDupesFilter(args.getObviousDupeCondition(),null); if (reverseOBVDupeDFFilter != null) { @@ -192,7 +193,7 @@ protected ZFrame removeObvDupesFromBlocks(ZFrame blocks) { return blocks; } - protected ZFrame getObvDupePairs(ZFrame blocked) { + public ZFrame getObvDupePairs(ZFrame blocked) { String obviousDupeString = args.getObviousDupeCondition(); @@ -286,13 +287,13 @@ public void writeOutput( ZFrame blocked, ZFrame dupesActual) th } - protected ZFrame getGraphWithScores(ZFrame graph, ZFrame score) { + public ZFrame getGraphWithScores(ZFrame graph, ZFrame score) { ZFramegraphWithScores = getDSUtil().joinZColFirst( score, graph, ColName.ID_COL, false).cache(); return graphWithScores; } - protected ZFramegetMinMaxScores(ZFramedupes, ZFramegraph) throws Exception { + public ZFramegetMinMaxScores(ZFramedupes, ZFramegraph) throws Exception { if (LOG.isDebugEnabled()) dupes.show(500); ZFrame graph1 = graph.select(ColName.ID_COL, ColName.CLUSTER_COLUMN); @@ -346,14 +347,14 @@ protected ZFrame getGraphWithScores(ZFrame graph, ZFramegetDupesActualForGraph(ZFramedupes) { + public ZFramegetDupesActualForGraph(ZFramedupes) { dupes = selectColsFromDupes(dupes); LOG.debug("dupes al"); if (LOG.isDebugEnabled()) dupes.show(); return dupes.filter(dupes.equalTo(ColName.PREDICTION_COL,ColValues.IS_MATCH_PREDICTION)); } - protected ZFrameselectColsFromDupes(ZFramedupesActual) { + public ZFrameselectColsFromDupes(ZFramedupesActual) { List cols = new ArrayList(); cols.add(dupesActual.col(ColName.ID_COL)); cols.add(dupesActual.col(ColName.COL_PREFIX + ColName.ID_COL)); @@ -363,6 +364,6 @@ protected ZFrame getGraphWithScores(ZFrame graph, ZFrame getStopWords(); + public abstract StopWordsRemover getStopWords(); } diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index 4b8e2ce53..6aba00458 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -21,7 +21,7 @@ * @author sgoyal * */ -public class SparkClient extends Client, Row, Column, DataType> { +public class SparkClient extends Client, Row, Column, DataType> { private static final long serialVersionUID = 1L; protected static final String zFactoryClassName = "zingg.spark.core.executor.SparkZFactory"; @@ -32,13 +32,10 @@ public SparkClient(Arguments args, ClientOptions options) throws ZinggClientExce - public SparkClient(Arguments args, ClientOptions options, ZSparkSession s) throws ZinggClientException { + public SparkClient(Arguments args, ClientOptions options, SparkSession s) throws ZinggClientException { super(args, options, s, zFactoryClassName); } - public SparkClient(Arguments args, ClientOptions options, SparkSession s) throws ZinggClientException { - this(args, options, new ZSparkSession(s,null)); - } public SparkClient() { /*SparkSession session = SparkSession @@ -55,7 +52,7 @@ public SparkClient() { @Override - public Client, Row, Column, DataType> getClient(Arguments args, + public Client, Row, Column, DataType> getClient(Arguments args, ClientOptions options) throws ZinggClientException { // TODO Auto-generated method stub SparkClient client = null; diff --git a/spark/client/src/main/java/zingg/spark/client/ZSparkSession.java b/spark/client/src/main/java/zingg/spark/client/ZSparkSession.java deleted file mode 100644 index 756b7e574..000000000 --- 
a/spark/client/src/main/java/zingg/spark/client/ZSparkSession.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package zingg.spark.client;
-import org.apache.spark.sql.SparkSession;
-
-import zingg.common.client.ZSession;
-import zingg.common.client.license.IZinggLicense;
-
-public class ZSparkSession implements ZSession<SparkSession> {
-
-	private SparkSession session;
-
-	private IZinggLicense license;
-
-	public ZSparkSession(SparkSession session, IZinggLicense license) {
-		super();
-		this.session = session;
-		this.license = license;
-	}
-
-	@Override
-	public SparkSession getSession() {
-		return session;
-	}
-
-	@Override
-	public void setSession(SparkSession session) {
-		this.session = session;
-	}
-
-	@Override
-	public IZinggLicense getLicense() {
-		return license;
-	}
-
-	@Override
-	public void setLicense(IZinggLicense license) {
-		this.license = license;
-	}
-
-}
diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java
index 09f8e5642..ec6ae2bc2 100644
--- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java
+++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java
@@ -10,17 +10,17 @@ import zingg.common.core.Context;
 import zingg.common.core.documenter.DataColDocumenter;
 import zingg.common.core.documenter.RowWrapper;
-import zingg.spark.client.ZSparkSession;
+import org.apache.spark.sql.SparkSession;
 
 /**
  * Spark specific implementation of DataColDocumenter
  *
  */
-public class SparkDataColDocumenter extends DataColDocumenter<ZSparkSession, Dataset<Row>, Row, Column,DataType> {
+public class SparkDataColDocumenter extends DataColDocumenter<SparkSession, Dataset<Row>, Row, Column,DataType> {
 
 	private static final long serialVersionUID = 1L;
 
-	public SparkDataColDocumenter(Context<ZSparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
+	public SparkDataColDocumenter(Context<SparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
 		super(context, args);
 	}
 
diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java
index ab8a1f32e..78b3e107a 100644
--- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java
+++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java
@@ -10,17 +10,17 @@ import zingg.common.core.Context;
 import zingg.common.core.documenter.DataDocumenter;
 import zingg.common.core.documenter.RowWrapper;
-import zingg.spark.client.ZSparkSession;
+import org.apache.spark.sql.SparkSession;
 
 /**
  * Spark specific implementation of DataDocumenter
  *
 */
-public class SparkDataDocumenter extends DataDocumenter<ZSparkSession, Dataset<Row>, Row, Column,DataType> {
+public class SparkDataDocumenter extends DataDocumenter<SparkSession, Dataset<Row>, Row, Column,DataType> {
 
 	private static final long serialVersionUID = 1L;
 
-	public SparkDataDocumenter(Context<ZSparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
+	public SparkDataDocumenter(Context<SparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
 		super(context, args);
 	}
 
diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java
index 5cd49fb61..1145f9408 100644
--- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java
+++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java
@@ -10,18 +10,18 @@ import zingg.common.core.Context;
 import zingg.common.core.documenter.ModelColDocumenter;
 import zingg.common.core.documenter.RowWrapper;
-import zingg.spark.client.ZSparkSession;
+import org.apache.spark.sql.SparkSession;
 
 /**
  * Spark specific implementation of ModelColDocumenter
  *
  *
  */
-public class SparkModelColDocumenter extends ModelColDocumenter<ZSparkSession, Dataset<Row>, Row, Column,DataType> {
+public class SparkModelColDocumenter extends ModelColDocumenter<SparkSession, Dataset<Row>, Row, Column,DataType> {
 
 	private static final long serialVersionUID = 1L;
 
-	public SparkModelColDocumenter(Context<ZSparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
+	public SparkModelColDocumenter(Context<SparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
 		super(context, args);
 	}
 
diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java
index 2a210bbb3..e53ce48c4 100644
--- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java
+++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java
@@ -10,17 +10,17 @@ import zingg.common.core.Context;
 import zingg.common.core.documenter.ModelDocumenter;
 import zingg.common.core.documenter.RowWrapper;
-import zingg.spark.client.ZSparkSession;
+import org.apache.spark.sql.SparkSession;
 
 /**
  * Spark specific implementation of ModelDocumenter
  *
 */
-public class SparkModelDocumenter extends ModelDocumenter<ZSparkSession, Dataset<Row>, Row, Column,DataType> {
+public class SparkModelDocumenter extends ModelDocumenter<SparkSession, Dataset<Row>, Row, Column,DataType> {
 
 	private static final long serialVersionUID = 1L;
 
-	public SparkModelDocumenter(Context<ZSparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
+	public SparkModelDocumenter(Context<SparkSession, Dataset<Row>, Row, Column,DataType> context, Arguments args) {
 		super(context, args);
 		super.modelColDoc = new SparkModelColDocumenter(context,args);
 	}
 
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java
index 7eaba6e7b..d9ff6c277 100644
--- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java
+++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java
@@ -14,12 +14,12 @@ import zingg.common.core.documenter.DataDocumenter;
 import zingg.common.core.documenter.ModelDocumenter;
 import zingg.common.core.executor.Documenter;
-import zingg.spark.client.ZSparkSession;
+import org.apache.spark.sql.SparkSession;
 import zingg.spark.core.documenter.SparkDataDocumenter;
 import zingg.spark.core.documenter.SparkModelDocumenter;
 
 
-public class SparkDocumenter extends Documenter<ZSparkSession, Dataset<Row>, Row, Column,DataType> {
+public class SparkDocumenter extends Documenter<SparkSession, Dataset<Row>, Row, Column,DataType> {
 
 	private static final long serialVersionUID = 1L;
 	public static String name = "zingg.spark.core.executor.SparkDocumenter";
@@ -37,13 +37,13 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep
 	}
 
 	@Override
-	protected ModelDocumenter<ZSparkSession, Dataset<Row>, Row, Column, DataType> getModelDocumenter() {
+	protected ModelDocumenter<SparkSession, Dataset<Row>, Row, Column, DataType> getModelDocumenter() {
 		return new SparkModelDocumenter(getContext(),getArgs());
 	}
 
 
 	@Override
-	protected DataDocumenter<ZSparkSession, Dataset<Row>, Row, Column, DataType> getDataDocumenter() {
+	protected DataDocumenter<SparkSession, Dataset<Row>, Row, Column, DataType> getDataDocumenter() {
 		return new SparkDataDocumenter(getContext(),getArgs());
 	}
 
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java
fa3a8e1c1..c5ad0943a 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -13,9 +13,9 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.FindAndLabeller; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; -public class SparkFindAndLabeller extends FindAndLabeller, Row, Column,DataType> { +public class SparkFindAndLabeller extends FindAndLabeller, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkFindAndLabeller"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index 8713cc0d6..33e44940e 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -14,7 +14,7 @@ import zingg.common.client.license.IZinggLicense; import zingg.common.client.pipe.Pipe; import zingg.common.core.executor.LabelUpdater; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; /** @@ -22,7 +22,7 @@ * * */ -public class SparkLabelUpdater extends LabelUpdater, Row, Column,DataType> { +public class SparkLabelUpdater extends LabelUpdater, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkLabelUpdater"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index 20f99f49e..af42b10cd 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -12,14 +12,14 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.Labeller; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; /** * Spark specific implementation of Labeller * * */ -public class SparkLabeller extends Labeller, Row, Column,DataType> { +public class SparkLabeller extends Labeller, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkLabeller"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 3f03865ea..a458044f0 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -14,11 +14,11 @@ import zingg.common.core.executor.Linker; import zingg.common.core.model.Model; import zingg.common.core.preprocess.StopWordsRemover; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.preprocess.SparkStopWordsRemover; -public class SparkLinker extends Linker, Row, Column,DataType> { +public class SparkLinker extends Linker, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkLinker"; @@ -36,14 +36,14 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep } @Override - protected Model 
getModel() throws ZinggClientException { + public Model getModel() throws ZinggClientException { Model model = getModelUtil().loadModel(false, args); model.register(getContext().getSession()); return model; } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + public StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index db944b091..fde5a6c4e 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -15,7 +15,7 @@ import zingg.common.core.executor.Matcher; import zingg.common.core.model.Model; import zingg.common.core.preprocess.StopWordsRemover; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.preprocess.SparkStopWordsRemover; /** @@ -23,7 +23,7 @@ * * */ -public class SparkMatcher extends Matcher,Row,Column,DataType>{ +public class SparkMatcher extends Matcher,Row,Column,DataType>{ private static final long serialVersionUID = 1L; @@ -47,14 +47,14 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep @Override - protected Model getModel() throws ZinggClientException { + public Model getModel() throws ZinggClientException { Model model = getModelUtil().loadModel(false, args); model.register(getContext().getSession()); return model; } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + public StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index 5deeceab1..659a06d5b 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -17,9 +17,9 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.ZinggBase; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; -public class SparkPeekModel extends ZinggBase, Row, Column, DataType>{ +public class SparkPeekModel extends ZinggBase, Row, Column, DataType>{ private static final long serialVersionUID = 1L; protected static String name = "zingg.spark.core.executor.SparkPeekModel"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index bec8a3ca3..3501380e2 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -13,7 +13,7 @@ import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.Recommender; import zingg.common.core.recommender.StopWordsRecommender; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.recommender.SparkStopWordsRecommender; @@ -21,7 +21,7 @@ * Spark specific implementation of Recommender * */ -public class SparkRecommender extends Recommender, Row, Column,DataType> { +public class SparkRecommender extends Recommender, Row, Column,DataType> { 
private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkRecommender"; @@ -39,8 +39,8 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep } @Override - public StopWordsRecommender, Row, Column, DataType> getStopWordsRecommender() { - StopWordsRecommender, Row, Column, DataType> stopWordsRecommender = new SparkStopWordsRecommender(getContext(),args); + public StopWordsRecommender, Row, Column, DataType> getStopWordsRecommender() { + StopWordsRecommender, Row, Column, DataType> stopWordsRecommender = new SparkStopWordsRecommender(getContext(),args); return stopWordsRecommender; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 1a76d8f4d..7459c0b15 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -12,9 +12,9 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.TrainMatcher; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; -public class SparkTrainMatcher extends TrainMatcher, Row, Column,DataType> { +public class SparkTrainMatcher extends TrainMatcher, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkTrainMatcher"; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index a49c8f0af..2735db9d8 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -13,11 +13,11 @@ import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.Trainer; import zingg.common.core.preprocess.StopWordsRemover; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.preprocess.SparkStopWordsRemover; -public class SparkTrainer extends Trainer, Row, Column,DataType> { +public class SparkTrainer extends Trainer, Row, Column,DataType> { public static String name = "zingg.spark.core.executor.SparkTrainer"; private static final long serialVersionUID = 1L; @@ -39,7 +39,7 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + protected StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index 0cc86593e..532a69875 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -13,10 +13,10 @@ import zingg.common.client.license.IZinggLicense; import zingg.common.core.executor.TrainingDataFinder; import zingg.common.core.preprocess.StopWordsRemover; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.preprocess.SparkStopWordsRemover; -public class SparkTrainingDataFinder extends TrainingDataFinder, Row, Column,DataType> { 
+public class SparkTrainingDataFinder extends TrainingDataFinder, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.core.executor.SparkTrainingDataFinder"; @@ -38,7 +38,7 @@ public void init(Arguments args, IZinggLicense license) throws ZinggClientExcep } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + protected StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java index bf28e5fb3..bfc86dafd 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java @@ -19,7 +19,7 @@ import zingg.common.core.util.HashUtil; import zingg.common.core.util.ModelUtil; import zingg.common.core.util.PipeUtilBase; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkDSUtil; import zingg.spark.core.util.SparkGraphUtil; @@ -28,18 +28,18 @@ import zingg.spark.core.util.SparkPipeUtil; -public class ZinggSparkContext implements Context, Row,Column,DataType>{ +public class ZinggSparkContext implements Context, Row,Column,DataType>{ private static final long serialVersionUID = 1L; protected JavaSparkContext ctx; - protected ZSparkSession zSession; - protected PipeUtilBase, Row, Column> pipeUtil; - protected HashUtil, Row, Column, DataType> hashUtil; - protected DSUtil, Row, Column> dsUtil; + protected SparkSession zSession; + protected PipeUtilBase, Row, Column> pipeUtil; + protected HashUtil, Row, Column, DataType> hashUtil; + protected DSUtil, Row, Column> dsUtil; protected GraphUtil, Row, Column> graphUtil; - protected ModelUtil, Row, Column> modelUtil; - protected BlockingTreeUtil, Row, Column, DataType> blockingTreeUtil; + protected ModelUtil, Row, Column> modelUtil; + protected BlockingTreeUtil, Row, Column, DataType> blockingTreeUtil; public static final String hashFunctionFile = "hashFunctions.json"; @@ -47,11 +47,11 @@ public class ZinggSparkContext implements Context, R public static final Log LOG = LogFactory.getLog(ZinggSparkContext.class); - public ZSparkSession getSession() { + public SparkSession getSession() { return zSession; } - public void setSession(ZSparkSession spark) { + public void setSession(SparkSession spark) { LOG.debug("Session passed to context is " + spark); this.zSession = spark; } @@ -62,16 +62,16 @@ public void setSession(ZSparkSession spark) { public void init(IZinggLicense license) throws ZinggClientException { try{ - if (zSession==null || zSession.getSession() == null) { - SparkSession spark = SparkSession + if (zSession==null) { + zSession = SparkSession .builder() .appName("Zingg") .getOrCreate(); - zSession = new ZSparkSession(spark, license); + //zSession = new SparkSession(spark, license); } if (ctx==null) { - ctx = JavaSparkContext.fromSparkContext(zSession.getSession().sparkContext()); + ctx = JavaSparkContext.fromSparkContext(zSession.sparkContext()); JavaSparkContext.jarOfClass(IZingg.class); LOG.debug("Context " + ctx.toString()); //initHashFns(); @@ -91,8 +91,8 @@ public void cleanup() { if (ctx != null) { ctx.stop(); } - if (zSession!=null && zSession.getSession() != null) { - zSession.getSession().stop(); + if (zSession!=null) { + 
zSession.stop(); } ctx = null; zSession = null; @@ -104,7 +104,7 @@ public void cleanup() { @Override public void setUtils() { - LOG.debug("Session passed to utils is " + zSession.getSession()); + LOG.debug("Session passed to utils is " + zSession); setPipeUtil(new SparkPipeUtil(zSession)); setDSUtil(new SparkDSUtil(zSession)); setHashUtil(new SparkHashUtil(zSession)); @@ -127,7 +127,7 @@ public void initHashFns() throws ZinggClientException { - public void setHashUtil(HashUtil, Row, Column, DataType> t) { + public void setHashUtil(HashUtil, Row, Column, DataType> t) { this.hashUtil = t; } @@ -137,24 +137,24 @@ public void setGraphUtil(GraphUtil, Row, Column> t) { - public void setPipeUtil(PipeUtilBase, Row, Column> pipeUtil) { + public void setPipeUtil(PipeUtilBase, Row, Column> pipeUtil) { this.pipeUtil = pipeUtil; } - public void setDSUtil(DSUtil, Row, Column> pipeUtil) { + public void setDSUtil(DSUtil, Row, Column> pipeUtil) { this.dsUtil = pipeUtil; } - public void setBlockingTreeUtil(BlockingTreeUtil, Row, Column, DataType> d) { + public void setBlockingTreeUtil(BlockingTreeUtil, Row, Column, DataType> d) { this.blockingTreeUtil = d; } - public void setModelUtil(ModelUtil, Row, Column> t) { + public void setModelUtil(ModelUtil, Row, Column> t) { this.modelUtil = t; } - public ModelUtil, Row, Column> getModelUtil() { + public ModelUtil, Row, Column> getModelUtil() { return modelUtil; } @@ -165,7 +165,7 @@ public void setSession(SparkSession session) { */ @Override - public HashUtil, Row, Column, DataType> getHashUtil() { + public HashUtil, Row, Column, DataType> getHashUtil() { return hashUtil; } @@ -175,17 +175,17 @@ public GraphUtil, Row, Column> getGraphUtil() { } @Override - public DSUtil, Row, Column> getDSUtil() { + public DSUtil, Row, Column> getDSUtil() { return dsUtil; } @Override - public PipeUtilBase, Row, Column> getPipeUtil() { + public PipeUtilBase, Row, Column> getPipeUtil() { return pipeUtil; } @Override - public BlockingTreeUtil, Row, Column, DataType> getBlockingTreeUtil() { + public BlockingTreeUtil, Row, Column, DataType> getBlockingTreeUtil() { return blockingTreeUtil; } diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index 2c60d1e74..72a17763f 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -30,11 +30,11 @@ import zingg.common.core.model.Model; import zingg.common.core.similarity.function.SimFunction; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.similarity.SparkSimFunction; import zingg.spark.core.similarity.SparkTransformer; -public class SparkModel extends Model, Row, Column>{ +public class SparkModel extends Model, Row, Column>{ public static final Log LOG = LogFactory.getLog(SparkModel.class); public static final Log DbLOG = LogFactory.getLog("WEB"); @@ -171,7 +171,7 @@ public ZFrame,Row,Column> transform(ZFrame,Row,Column> @Override - public void register(ZSparkSession spark) { + public void register(SparkSession spark) { if (featureCreators != null) { for (SparkTransformer bsf: featureCreators) { bsf.register(spark); diff --git a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java index f6ab7486a..0227964e4 100644 --- 
a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java +++ b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.api.java.UDF1; import org.apache.spark.sql.types.DataTypes; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.similarity.SparkBaseTransformer; public class VectorValueExtractor extends SparkBaseTransformer implements UDF1<Vector, Double>{ @@ -25,8 +25,8 @@ public Double call(Vector v) { } @Override - public void register(ZSparkSession spark) { - spark.getSession().udf().register(uid, (UDF1) this, DataTypes.DoubleType); + public void register(SparkSession spark) { + spark.udf().register(uid, (UDF1) this, DataTypes.DoubleType); } /*@Override diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index bc53b4652..a2f3fe666 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -18,9 +18,9 @@ import zingg.common.core.Context; import zingg.common.core.preprocess.StopWordsRemover; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; -public class SparkStopWordsRemover extends StopWordsRemover<ZSparkSession,Dataset<Row>,Row,Column,DataType> implements Serializable { +public class SparkStopWordsRemover extends StopWordsRemover<SparkSession,Dataset<Row>,Row,Column,DataType> implements Serializable { private static final long serialVersionUID = 1L; protected static String name = "zingg.spark.preprocess.SparkStopWordsRemover"; @@ -28,7 +28,7 @@ public class SparkStopWordsRemover extends StopWordsRemover - public SparkStopWordsRemover(Context<ZSparkSession, Dataset<Row>, Row, Column,DataType> context,Arguments args) { + public SparkStopWordsRemover(Context<SparkSession, Dataset<Row>, Row, Column,DataType> context,Arguments args) { super(context,args); this.udfName = registerUDF(); } @@ -45,8 +45,8 @@ protected String registerUDF() { // Each field will have different pattern String udfName = removeStopWordsUDF.getName(); // register the UDF - ZSparkSession zSession = getContext().getSession(); - zSession.getSession().udf().register(udfName, removeStopWordsUDF, DataTypes.StringType); + SparkSession zSession = getContext().getSession(); + zSession.udf().register(udfName, removeStopWordsUDF, DataTypes.StringType); return udfName; } diff --git a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java index b7fec9a99..21f793d3f 100644 --- a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.core.Context; import zingg.common.core.recommender.StopWordsRecommender; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; /** @@ -18,13 +18,13 @@ * * */ -public class SparkStopWordsRecommender extends StopWordsRecommender<ZSparkSession, Dataset<Row>, Row, Column,DataType> { +public class SparkStopWordsRecommender extends StopWordsRecommender<SparkSession, Dataset<Row>, Row, Column,DataType> { private static final long serialVersionUID = 1L; public static String name = "zingg.spark.SparkStopWordsRecommender"; public static final Log LOG = LogFactory.getLog(SparkStopWordsRecommender.class); - public
SparkStopWordsRecommender(Context, Row, Column,DataType> context,Arguments args) { + public SparkStopWordsRecommender(Context, Row, Column,DataType> context,Arguments args) { super(context,args); } diff --git a/spark/core/src/main/java/zingg/spark/core/similarity/SparkBaseTransformer.java b/spark/core/src/main/java/zingg/spark/core/similarity/SparkBaseTransformer.java index 5773bad9b..b7dd56a1e 100644 --- a/spark/core/src/main/java/zingg/spark/core/similarity/SparkBaseTransformer.java +++ b/spark/core/src/main/java/zingg/spark/core/similarity/SparkBaseTransformer.java @@ -14,7 +14,7 @@ import org.apache.spark.sql.types.StructType; import zingg.common.client.util.ColName; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; public abstract class SparkBaseTransformer extends Transformer implements HasInputCol, HasOutputCol { @@ -113,6 +113,6 @@ public Param outputCol() { - public abstract void register(ZSparkSession spark); + public abstract void register(SparkSession spark); } diff --git a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java index f24533306..dc6255ca2 100644 --- a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java +++ b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java @@ -5,7 +5,7 @@ import org.apache.spark.sql.api.java.UDF2; import org.apache.spark.sql.types.DataTypes; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; public class SparkTransformer extends SparkBaseTransformer { @@ -25,8 +25,8 @@ public SparkTransformer(String inputCol, SparkSimFunction function, String outpu - public void register(ZSparkSession spark) { - spark.getSession().udf().register(getUid(), (UDF2) function, DataTypes.DoubleType); + public void register(SparkSession spark) { + spark.udf().register(getUid(), (UDF2) function, DataTypes.DoubleType); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java index 03cb286cd..2cc13a4d9 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java @@ -26,16 +26,16 @@ import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.PipeUtilBase; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.block.SparkBlock; import zingg.spark.core.block.SparkBlockFunction; -public class SparkBlockingTreeUtil extends BlockingTreeUtil, Row, Column, DataType>{ +public class SparkBlockingTreeUtil extends BlockingTreeUtil, Row, Column, DataType>{ public static final Log LOG = LogFactory.getLog(SparkBlockingTreeUtil.class); - protected ZSparkSession spark; + protected SparkSession spark; - public SparkBlockingTreeUtil(ZSparkSession s, PipeUtilBase pipeUtil) { + public SparkBlockingTreeUtil(SparkSession s, PipeUtilBase pipeUtil) { this.spark = s; setPipeUtil(pipeUtil); } @@ -62,7 +62,7 @@ public ZFrame, Row, Column> getTreeDF(byte[] blockingTree){ StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("BlockingTree", DataTypes.BinaryType, false) }); List objList = new ArrayList<>(); objList.add(RowFactory.create(blockingTree)); - Dataset df = spark.getSession().sqlContext().createDataFrame(objList, 
schema).toDF().coalesce(1); return new SparkFrame(df); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java b/spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java index 127d29e2a..eeab194c4 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java @@ -10,16 +10,16 @@ import zingg.common.client.ZinggClientException; import zingg.common.core.util.DFReader; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; public class SparkDFReader implements DFReader<Dataset<Row>, Row, Column> { - private ZSparkSession session; + private SparkSession session; private DataFrameReader reader; - public SparkDFReader(ZSparkSession s) { + public SparkDFReader(SparkSession s) { this.session = s; - this.reader = s.getSession().read(); + this.reader = s.read(); } public DFReader<Dataset<Row>, Row, Column> getReader() { diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java index 1ed2cf265..939b3b87d 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java @@ -10,11 +10,11 @@ import zingg.common.core.util.DSUtil; import zingg.scala.DFUtil; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; -public class SparkDSUtil extends DSUtil<ZSparkSession, Dataset<Row>, Row, Column>{ +public class SparkDSUtil extends DSUtil<SparkSession, Dataset<Row>, Row, Column>{ - public SparkDSUtil(ZSparkSession s) { + public SparkDSUtil(SparkSession s) { super(s); //TODO Auto-generated constructor stub } @@ -28,16 +28,16 @@ public SparkDSUtil(ZSparkSession s) { @Override public ZFrame<Dataset<Row>, Row, Column> addClusterRowNumber(ZFrame<Dataset<Row>, Row, Column> ds) { - ZSparkSession zSparkSession = getSession(); - return new SparkFrame(DFUtil.addClusterRowNumber(((Dataset<Row>)ds.df()), zSparkSession.getSession())); + SparkSession sparkSession = getSession(); + return new SparkFrame(DFUtil.addClusterRowNumber(((Dataset<Row>)ds.df()), sparkSession)); } @Override public ZFrame<Dataset<Row>, Row, Column> addRowNumber(ZFrame<Dataset<Row>, Row, Column> ds) { - ZSparkSession zSparkSession = getSession(); - return new SparkFrame(DFUtil.addRowNumber(((Dataset<Row>)ds.df()), zSparkSession.getSession())); + SparkSession sparkSession = getSession(); + return new SparkFrame(DFUtil.addRowNumber(((Dataset<Row>)ds.df()), sparkSession)); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java index c128dc888..fcaa48a77 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java @@ -9,19 +9,19 @@ import zingg.common.core.hash.HashFnFromConf; import zingg.common.core.hash.HashFunction; import zingg.common.core.util.BaseHashUtil; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.hash.SparkHashFunctionRegistry; -public class SparkHashUtil extends BaseHashUtil<ZSparkSession, Dataset<Row>, Row, Column,DataType>{ +public class SparkHashUtil extends BaseHashUtil<SparkSession, Dataset<Row>, Row, Column,DataType>{ - public SparkHashUtil(ZSparkSession spark) { + public SparkHashUtil(SparkSession spark) { super(spark); } public HashFunction<Dataset<Row>, Row, Column,DataType>
registerHashFunction(HashFnFromConf scriptArg) { HashFunction, Row, Column,DataType> fn = new SparkHashFunctionRegistry().getFunction(scriptArg.getName()); - getSessionObj().getSession().udf().register(fn.getName(), (UDF1) fn, fn.getReturnType()); + getSessionObj().udf().register(fn.getName(), (UDF1) fn, fn.getReturnType()); return fn; } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java index a3a01679e..5967e7c72 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java @@ -11,23 +11,23 @@ import zingg.common.core.feature.FeatureFactory; import zingg.common.core.model.Model; import zingg.common.core.util.ModelUtil; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.feature.SparkFeatureFactory; import zingg.spark.core.model.SparkLabelModel; import zingg.spark.core.model.SparkModel; -public class SparkModelUtil extends ModelUtil, Row, Column> { +public class SparkModelUtil extends ModelUtil, Row, Column> { public static final Log LOG = LogFactory.getLog(SparkModelUtil.class); - public SparkModelUtil(ZSparkSession s) { + public SparkModelUtil(SparkSession s) { this.session = s; } - public Model, Row, Column> getModel(boolean isLabel, Arguments args) throws ZinggClientException{ - Model, Row, Column> model = null; + public Model, Row, Column> getModel(boolean isLabel, Arguments args) throws ZinggClientException{ + Model, Row, Column> model = null; if (isLabel) { model = new SparkLabelModel(getFeaturers(args)); } @@ -38,9 +38,9 @@ public Model, Row, Column> getModel(boolean } @Override - public Model, Row, Column> loadModel(boolean isLabel, + public Model, Row, Column> loadModel(boolean isLabel, Arguments args) throws ZinggClientException { - Model, Row, Column> model = getModel(isLabel, args); + Model, Row, Column> model = getModel(isLabel, args); model.load(args.getModel()); return model; } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java index 999f549c9..3c4dd617f 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java @@ -14,29 +14,29 @@ import zingg.common.core.util.DFWriter; import zingg.common.core.util.PipeUtil; import zingg.spark.client.SparkFrame; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; //import com.datastax.spark.connector.cql.*; //import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL; //import zingg.scala.DFUtil; -public class SparkPipeUtil extends PipeUtil, Row, Column>{ +public class SparkPipeUtil extends PipeUtil, Row, Column>{ public final Log LOG = LogFactory.getLog(SparkPipeUtil.class); //private SparkDFReader reader; - public SparkPipeUtil(ZSparkSession spark) { + public SparkPipeUtil(SparkSession spark) { super(spark); } - public ZSparkSession getSession(){ + public SparkSession getSession(){ return this.session; } - public void setSession(ZSparkSession session){ + public void setSession(SparkSession session){ this.session = session; } diff --git a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java index 354867018..4b72130c2 100644 --- 
a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java @@ -18,7 +18,7 @@ import zingg.common.client.Arguments; import zingg.common.client.IZingg; -import zingg.spark.client.ZSparkSession; +import org.apache.spark.sql.SparkSession; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkDSUtil; import zingg.spark.core.util.SparkGraphUtil; @@ -32,7 +32,7 @@ public class ZinggSparkTester { public static JavaSparkContext ctx; public static SparkSession spark; public static ZinggSparkContext zsCTX; - public static ZSparkSession zSession; + public static SparkSession zSession; public static final Log LOG = LogFactory.getLog(ZinggSparkTester.class); @@ -52,7 +52,7 @@ public static void setup() { args = new Arguments(); zsCTX = new ZinggSparkContext(); zsCTX.ctx = ctx; - zSession = new ZSparkSession(spark, null); + zSession = spark; zsCTX.zSession = zSession; ctx.setCheckpointDir("/tmp/checkpoint"); From da4057fa6c9f08176c1a4decb192d36d3b997002 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sun, 27 Aug 2023 15:03:16 +0530 Subject: [PATCH 005/219] matcher works in enterprise --- .../main/java/zingg/common/client/ZSession.java | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 common/client/src/main/java/zingg/common/client/ZSession.java diff --git a/common/client/src/main/java/zingg/common/client/ZSession.java b/common/client/src/main/java/zingg/common/client/ZSession.java deleted file mode 100644 index 1b778bad7..000000000 --- a/common/client/src/main/java/zingg/common/client/ZSession.java +++ /dev/null @@ -1,16 +0,0 @@ -package zingg.common.client; - -import zingg.common.client.license.IZinggLicense; - -public interface ZSession<S> { - - public S getSession(); - - public void setSession(S session); - - public IZinggLicense getLicense(); - - public void setLicense(IZinggLicense license); - - -} From 3938fb832c481a9f7dd97249adee496797971c7c Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Mon, 28 Aug 2023 15:08:53 +0530 Subject: [PATCH 006/219] added zingg options --- .../main/java/zingg/common/core/executor/Documenter.java | 2 +- .../java/zingg/common/core/executor/FindAndLabeller.java | 2 +- .../zingg/common/core/executor/LabelDataViewHelper.java | 4 +++- .../java/zingg/common/core/executor/LabelUpdater.java | 2 +- .../main/java/zingg/common/core/executor/Labeller.java | 2 +- .../src/main/java/zingg/common/core/executor/Linker.java | 2 +- .../main/java/zingg/common/core/executor/Matcher.java | 2 +- .../java/zingg/common/core/executor/Recommender.java | 2 +- .../java/zingg/common/core/executor/TrainMatcher.java | 2 +- .../zingg/common/core/executor/TrainingDataFinder.java | 2 +- .../zingg/common/core/executor/TrainingDataModel.java | 3 ++- .../main/java/zingg/common/core/executor/ZinggBase.java | 9 +++++---- .../java/zingg/spark/core/executor/SparkDocumenter.java | 2 +- .../zingg/spark/core/executor/SparkFindAndLabeller.java | 2 +- .../zingg/spark/core/executor/SparkLabelUpdater.java | 2 +- .../java/zingg/spark/core/executor/SparkLabeller.java | 2 +- .../main/java/zingg/spark/core/executor/SparkLinker.java | 2 +- .../java/zingg/spark/core/executor/SparkMatcher.java | 2 +- .../java/zingg/spark/core/executor/SparkPeekModel.java | 2 +- .../java/zingg/spark/core/executor/SparkRecommender.java | 2 +- .../zingg/spark/core/executor/SparkTrainMatcher.java | 2 +- .../java/zingg/spark/core/executor/SparkTrainer.java | 2 +-
.../spark/core/executor/SparkTrainingDataFinder.java | 2 +- 23 files changed, 30 insertions(+), 26 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Documenter.java b/common/core/src/main/java/zingg/common/core/executor/Documenter.java index f0bc2e21d..6e80b8aa7 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Documenter.java +++ b/common/core/src/main/java/zingg/common/core/executor/Documenter.java @@ -14,7 +14,7 @@ public abstract class Documenter extends ZinggBase { public static final Log LOG = LogFactory.getLog(Documenter.class); public Documenter() { - //setZinggOptions(ZinggOptions.GENERATE_DOCS); + setZinggOptions(ZinggOptions.GENERATE_DOCS); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index 26d0c1461..ef21600e3 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -17,7 +17,7 @@ public abstract class FindAndLabeller extends ZinggBase labeller; public FindAndLabeller() { - //setZinggOptions(ZinggOptions.FIND_AND_LABEL); + setZinggOptions(ZinggOptions.FIND_AND_LABEL); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index 9ac6bac78..65a650a40 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -23,7 +23,7 @@ public class LabelDataViewHelper extends ZinggBase imp public LabelDataViewHelper(Context context, ClientOptions clientOptions) { setContext(context); - //setZinggOptions(zinggOptions); + setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } @@ -127,5 +127,7 @@ public void execute() throws ZinggClientException { public ILabelDataViewHelper getLabelDataViewHelper() throws UnsupportedOperationException { return this; } + + } diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java index 92ba76953..0615819d4 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java @@ -19,7 +19,7 @@ public abstract class LabelUpdater extends Labeller { public static final Log LOG = LogFactory.getLog(LabelUpdater.class); public LabelUpdater() { - //setZinggOptions(ZinggOptions.UPDATE_LABEL); + setZinggOptions(ZinggOptions.UPDATE_LABEL); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Labeller.java b/common/core/src/main/java/zingg/common/core/executor/Labeller.java index 1240be03a..cc8cda8c4 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Labeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/Labeller.java @@ -24,7 +24,7 @@ public abstract class Labeller extends ZinggBase { protected ILabelDataViewHelper labelDataViewHelper; public Labeller() { - //setZinggOptions(ZinggOptions.LABEL); + setZinggOptions(ZinggOptions.LABEL); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java 
b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 76b8703d7..d28c8fef8 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -17,7 +17,7 @@ public abstract class Linker extends Matcher { public static final Log LOG = LogFactory.getLog(Linker.class); public Linker() { - //setZinggOptions(ZinggOptions.LINK); + setZinggOptions(ZinggOptions.LINK); } public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 99f6c05ff..aef6ea486 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -25,7 +25,7 @@ public abstract class Matcher extends ZinggBase{ public static final Log LOG = LogFactory.getLog(Matcher.class); public Matcher() { - //setZinggOptions(ZinggOptions.MATCH); + setZinggOptions(ZinggOptions.MATCH); } public ZFrame getTestData() throws ZinggClientException{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Recommender.java b/common/core/src/main/java/zingg/common/core/executor/Recommender.java index 2163b3c10..7119a1182 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Recommender.java +++ b/common/core/src/main/java/zingg/common/core/executor/Recommender.java @@ -13,7 +13,7 @@ public abstract class Recommender extends ZinggBase { public static final Log LOG = LogFactory.getLog(Recommender.class); public Recommender() { - //setZinggOptions(ZinggOptions.RECOMMEND); + setZinggOptions(ZinggOptions.RECOMMEND); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index 3d9e3cba9..162c033f9 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -17,7 +17,7 @@ public abstract class TrainMatcher extends ZinggBase{ protected Matcher matcher; public TrainMatcher() { - //setZinggOptions(ZinggOptions.TRAIN_MATCH); + setZinggOptions(ZinggOptions.TRAIN_MATCH); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index c26fd0522..50502afcf 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -20,7 +20,7 @@ public abstract class TrainingDataFinder extends ZinggBase public static final Log LOG = LogFactory.getLog(TrainingDataFinder.class); public TrainingDataFinder() { - //setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); } public ZFrame getTraining() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index c4e145e09..cd5fe1ebd 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -23,7 +23,7 @@ public class TrainingDataModel extends ZinggBase imple public 
TrainingDataModel(Context context, ClientOptions clientOptions) { setContext(context); - //setZinggOptions(zinggOptions); + setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } @@ -120,4 +120,5 @@ public long getTotalCount() { + } diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index 118b2716d..b58742d61 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -34,7 +34,7 @@ public abstract class ZinggBase implements Serializable, IZingg context; protected String name; - //protected ZinggOptions zinggOptions; + protected ZinggOptions zinggOptions; protected long startTime; protected ClientOptions clientOptions; @@ -86,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); - //Analytics.postEvent(zinggOptions.getValue(), collectMetrics); + Analytics.postEvent(zinggOptions.getValue(), collectMetrics); } public Arguments getArgs() { @@ -110,10 +110,11 @@ public void setContext(Context source) { public void setName(String name) { this.name = name; } - /*public void setZinggOptions(ZinggOptions zinggOptions) { + + public void setZinggOptions(ZinggOptions zinggOptions) { this.zinggOptions = zinggOptions; } - */ + public String getName() { return name; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index d9ff6c277..80ff364ae 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -26,7 +26,7 @@ public class SparkDocumenter extends Documenter, Row, public static final Log LOG = LogFactory.getLog(SparkDocumenter.class); public SparkDocumenter() { - //setZinggOptions(ZinggOptions.GENERATE_DOCS); + setZinggOptions(ZinggOptions.GENERATE_DOCS); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index c5ad0943a..b3eec4f77 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -22,7 +22,7 @@ public class SparkFindAndLabeller extends FindAndLabeller, public static final Log LOG = LogFactory.getLog(SparkLabelUpdater.class); public SparkLabelUpdater() { - //setZinggOptions(ZinggOptions.UPDATE_LABEL); + setZinggOptions(ZinggOptions.UPDATE_LABEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index af42b10cd..395ad27ee 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -30,7 +30,7 @@ public SparkLabeller() { } public SparkLabeller(ZinggSparkContext sparkContext) { - //setZinggOptions(ZinggOptions.LABEL); + setZinggOptions(ZinggOptions.LABEL); setContext(sparkContext); } diff --git 
a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index a458044f0..eb90d0bd3 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -25,7 +25,7 @@ public class SparkLinker extends Linker, Row, Column, public static final Log LOG = LogFactory.getLog(SparkLinker.class); public SparkLinker() { - //setZinggOptions(ZinggOptions.LINK); + setZinggOptions(ZinggOptions.LINK); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index fde5a6c4e..f99caeeab 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -35,7 +35,7 @@ public SparkMatcher() { } public SparkMatcher(ZinggSparkContext sparkContext) { - //setZinggOptions(ZinggOptions.MATCH); + setZinggOptions(ZinggOptions.MATCH); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index 659a06d5b..e63f942f2 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -26,7 +26,7 @@ public class SparkPeekModel extends ZinggBase, Row, C public static final Log LOG = LogFactory.getLog(SparkPeekModel.class); public SparkPeekModel() { - //setZinggOptions(ZinggOptions.PEEK_MODEL); + setZinggOptions(ZinggOptions.PEEK_MODEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 3501380e2..2fdf029c5 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -28,7 +28,7 @@ public class SparkRecommender extends Recommender, Ro public static final Log LOG = LogFactory.getLog(SparkRecommender.class); public SparkRecommender() { - //setZinggOptions(ZinggOptions.RECOMMEND); + setZinggOptions(ZinggOptions.RECOMMEND); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 7459c0b15..e0a7e54b3 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -21,7 +21,7 @@ public class SparkTrainMatcher extends TrainMatcher, public static final Log LOG = LogFactory.getLog(SparkTrainMatcher.class); public SparkTrainMatcher() { - //setZinggOptions(ZinggOptions.TRAIN_MATCH); + setZinggOptions(ZinggOptions.TRAIN_MATCH); ZinggSparkContext sparkContext = new ZinggSparkContext(); setContext(sparkContext); trainer = new SparkTrainer(sparkContext); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 2735db9d8..4b770f3a2 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -28,7 +28,7 @@ public 
SparkTrainer() { } public SparkTrainer(ZinggSparkContext sparkContext) { - //setZinggOptions(ZinggOptions.TRAIN); + setZinggOptions(ZinggOptions.TRAIN); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index 532a69875..f38a1edd7 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -27,7 +27,7 @@ public SparkTrainingDataFinder() { } public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { - //setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); setContext(sparkContext); } From 2347cf0e0a087553fdcc0ca11aaa6781d654cdda Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Mon, 28 Aug 2023 16:26:19 +0530 Subject: [PATCH 007/219] resolver removed from oss --- .../main/java/zingg/common/client/Client.java | 2 +- .../zingg/common/client/ZinggOptions.java | 63 ++++++++++--------- .../zingg/common/core/executor/ZinggBase.java | 2 +- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 643c6cae7..76242050f 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -89,7 +89,7 @@ public void setZingg(Arguments args, ClientOptions options) throws Exception{ catch(Exception e) { e.printStackTrace(); //set default - setZingg(zf.get(ZinggOptions.getByValue(ZinggOptions.PEEK_MODEL.getValue()))); + setZingg(zf.get(ZinggOptions.getByValue(ZinggOptions.PEEK_MODEL.getName()))); } } diff --git a/common/client/src/main/java/zingg/common/client/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/ZinggOptions.java index e4d1ef959..8a3ae43a7 100644 --- a/common/client/src/main/java/zingg/common/client/ZinggOptions.java +++ b/common/client/src/main/java/zingg/common/client/ZinggOptions.java @@ -1,47 +1,54 @@ package zingg.common.client; +import java.util.HashMap; +import java.util.Map; + import zingg.common.client.util.Util; -public enum ZinggOptions { - - TRAIN("train"), - MATCH("match"), - TRAIN_MATCH("trainMatch"), - FIND_TRAINING_DATA("findTrainingData"), - LABEL("label"), - LINK("link"), - GENERATE_DOCS("generateDocs"), - RECOMMEND("recommend"), - UPDATE_LABEL("updateLabel"), - FIND_AND_LABEL("findAndLabel"), - ASSESS_MODEL("assessModel"), - PEEK_MODEL("peekModel"), - EXPORT_MODEL("exportModel"), - RESOLVE("resolve"); - - private String value; - - ZinggOptions(String s) { - this.value = s; +public class ZinggOptions { + + public final static ZinggOptions TRAIN = new ZinggOptions("train"); + public final static ZinggOptions MATCH = new ZinggOptions("match"); + public final static ZinggOptions TRAIN_MATCH = new ZinggOptions("trainMatch"); + public final static ZinggOptions FIND_TRAINING_DATA = new ZinggOptions("findTrainingData"); + public final static ZinggOptions LABEL = new ZinggOptions("label"); + public final static ZinggOptions LINK = new ZinggOptions("link"); + public final static ZinggOptions GENERATE_DOCS = new ZinggOptions("generateDocs"); + public final static ZinggOptions RECOMMEND = new ZinggOptions("recommend"); + public final static ZinggOptions UPDATE_LABEL = new ZinggOptions("updateLabel"); + public final static ZinggOptions 
FIND_AND_LABEL = new ZinggOptions("findAndLabel"); + public final static ZinggOptions ASSESS_MODEL = new ZinggOptions("assessModel"); + public final static ZinggOptions PEEK_MODEL = new ZinggOptions("peekModel"); + public final static ZinggOptions EXPORT_MODEL = new ZinggOptions("exportModel"); + + public static Map<String, ZinggOptions> allZinggOptions = new HashMap<String, ZinggOptions>(); + + String name; + + public ZinggOptions(String name) { + this.name = name; + allZinggOptions.put(name, this); } - + + + public static String[] getAllZinggOptions() { - ZinggOptions[] zo = ZinggOptions.values(); + ZinggOptions[] zo = allZinggOptions.values().toArray(new ZinggOptions[allZinggOptions.size()]); int i = 0; String[] s = new String[zo.length]; for (ZinggOptions z: zo) { - s[i++] = z.getValue(); + s[i++] = z.getName(); } return s; } - public String getValue() { - return value; + public String getName() { + return name; } public static final ZinggOptions getByValue(String value){ - for (ZinggOptions zo: ZinggOptions.values()) { + for (ZinggOptions zo: ZinggOptions.allZinggOptions.values()) { if (zo.name.equals(value)) return zo; } return null; } diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index b58742d61..df6d57bd5 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -86,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); - Analytics.postEvent(zinggOptions.getValue(), collectMetrics); + Analytics.postEvent(zinggOptions.getName(), collectMetrics); } public Arguments getArgs() { From a37eb249ab30486365a3e11007e3f5124b433cb5 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Mon, 28 Aug 2023 20:48:15 +0530 Subject: [PATCH 008/219] rm license info from oss --- .../main/java/zingg/common/client/Client.java | 5 +--- .../main/java/zingg/common/client/IZingg.java | 4 +-- .../client/license/ILicenseValidator.java | 25 ------------------- .../common/client/license/IZinggLicense.java | 11 -------- .../main/java/zingg/common/core/Context.java | 3 +-- .../common/core/executor/FindAndLabeller.java | 9 +++---- .../common/core/executor/TrainMatcher.java | 9 +++---- .../zingg/common/core/executor/ZinggBase.java | 3 +-- .../java/zingg/spark/client/SparkClient.java | 7 +----- .../spark/core/executor/SparkDocumenter.java | 8 +++--- .../core/executor/SparkFindAndLabeller.java | 8 +++--- .../core/executor/SparkLabelUpdater.java | 8 +++--- .../spark/core/executor/SparkLabeller.java | 8 +++--- .../spark/core/executor/SparkLinker.java | 8 +++--- .../spark/core/executor/SparkMatcher.java | 8 +++--- .../spark/core/executor/SparkPeekModel.java | 6 ++--- .../spark/core/executor/SparkRecommender.java | 8 +++--- .../core/executor/SparkTrainMatcher.java | 8 +++--- .../spark/core/executor/SparkTrainer.java | 8 +++--- .../executor/SparkTrainingDataFinder.java | 8 +++--- .../core/executor/ZinggSparkContext.java | 4 +-- 21 files changed, 58 insertions(+), 108 deletions(-) delete mode 100644 common/client/src/main/java/zingg/common/client/license/ILicenseValidator.java delete mode 100644 common/client/src/main/java/zingg/common/client/license/IZinggLicense.java diff --git a/common/client/src/main/java/zingg/common/client/Client.java 
b/common/client/src/main/java/zingg/common/client/Client.java index 76242050f..ba1287d72 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -5,7 +5,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.license.IZinggLicense; import zingg.common.client.util.Email; import zingg.common.client.util.EmailBody; @@ -244,13 +243,11 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { } public void init() throws ZinggClientException { - zingg.init(getArguments(), getLicense(options.get(ClientOptions.LICENSE).value.trim())); + zingg.init(getArguments()); if (session != null) zingg.setSession(session); zingg.setClientOptions(options); } - protected abstract IZinggLicense getLicense(String license) throws ZinggClientException ; - /** * Stop the Spark job running context */ diff --git a/common/client/src/main/java/zingg/common/client/IZingg.java b/common/client/src/main/java/zingg/common/client/IZingg.java index ec675ec02..4710b91bc 100644 --- a/common/client/src/main/java/zingg/common/client/IZingg.java +++ b/common/client/src/main/java/zingg/common/client/IZingg.java @@ -1,10 +1,8 @@ package zingg.common.client; -import zingg.common.client.license.IZinggLicense; - public interface IZingg { - public void init(Arguments args, IZinggLicense license) + public void init(Arguments args) throws ZinggClientException; public void execute() throws ZinggClientException; diff --git a/common/client/src/main/java/zingg/common/client/license/ILicenseValidator.java b/common/client/src/main/java/zingg/common/client/license/ILicenseValidator.java deleted file mode 100644 index 92fa47a37..000000000 --- a/common/client/src/main/java/zingg/common/client/license/ILicenseValidator.java +++ /dev/null @@ -1,25 +0,0 @@ -package zingg.common.client.license; - -import java.util.Properties; - -public interface ILicenseValidator { - - public boolean validate(); - - public Properties getLicenseProps(); - - public void setLicenseProps(Properties licenseProps); - - public String getKey(); - - public void setKey(String key); - - public String getValToCheck(); - - public void setValToCheck(String valToCheck); - - public String getName(); - - public void setName(String name); - -} diff --git a/common/client/src/main/java/zingg/common/client/license/IZinggLicense.java b/common/client/src/main/java/zingg/common/client/license/IZinggLicense.java deleted file mode 100644 index 761b5aedb..000000000 --- a/common/client/src/main/java/zingg/common/client/license/IZinggLicense.java +++ /dev/null @@ -1,11 +0,0 @@ -package zingg.common.client.license; - -import java.util.Properties; - -public interface IZinggLicense { - - public ILicenseValidator getValidator(String name); - - public Properties getLicenseProps(); - -} diff --git a/common/core/src/main/java/zingg/common/core/Context.java b/common/core/src/main/java/zingg/common/core/Context.java index d475708ee..f51ba1447 100644 --- a/common/core/src/main/java/zingg/common/core/Context.java +++ b/common/core/src/main/java/zingg/common/core/Context.java @@ -3,7 +3,6 @@ import java.io.Serializable; import zingg.common.client.ZinggClientException; -import zingg.common.client.license.IZinggLicense; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.DSUtil; import zingg.common.core.util.GraphUtil; @@ -30,7 +29,7 @@ public interface Context extends Serializable { public PipeUtilBase getPipeUtil(); public 
BlockingTreeUtil getBlockingTreeUtil() ; - public void init(IZinggLicense license) + public void init() throws ZinggClientException; public void cleanup(); diff --git a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index ef21600e3..899956d57 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -6,7 +6,6 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; public abstract class FindAndLabeller extends ZinggBase { private static final long serialVersionUID = 1L; @@ -21,10 +20,10 @@ public FindAndLabeller() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - finder.init(args, license); - labeller.init(args, license); - super.init(args, license); + public void init(Arguments args) throws ZinggClientException { + finder.init(args); + labeller.init(args); + super.init(args); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index 162c033f9..cf65b5ffc 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -5,7 +5,6 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; public abstract class TrainMatcher extends ZinggBase{ @@ -21,11 +20,11 @@ public TrainMatcher() { } @Override - public void init(Arguments args, IZinggLicense license) + public void init(Arguments args) throws ZinggClientException { - trainer.init(args, license); - matcher.init(args, license); - super.init(args, license); + trainer.init(args); + matcher.init(args); + super.init(args); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index df6d57bd5..e08536bbd 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -14,7 +14,6 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import zingg.common.core.Context; @@ -63,7 +62,7 @@ public ZinggBase() { - public void init(Arguments args, IZinggLicense license) + public void init(Arguments args) throws ZinggClientException { startTime = System.currentTimeMillis(); this.args = args; diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index 6aba00458..8f975d6c9 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -13,8 +13,6 @@ import zingg.common.client.ClientOptions; import zingg.common.client.IZinggFactory; import zingg.common.client.ZinggClientException; -import zingg.common.client.license.IZinggLicense; - /** * This is the main point of 
interface with the Zingg matching product. * @@ -72,10 +70,7 @@ public static void main(String... args) { client.mainMethod(args); } - @Override - protected IZinggLicense getLicense(String license) throws ZinggClientException { - return null; - } + } \ No newline at end of file diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 80ff364ae..954302b24 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.documenter.DataDocumenter; import zingg.common.core.documenter.ModelDocumenter; import zingg.common.core.executor.Documenter; @@ -31,9 +31,9 @@ public SparkDocumenter() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index b3eec4f77..1deb9d8e7 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.FindAndLabeller; import org.apache.spark.sql.SparkSession; @@ -30,9 +30,9 @@ public SparkFindAndLabeller() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index 63ed97548..1a942a9b3 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.client.pipe.Pipe; import zingg.common.core.executor.LabelUpdater; import org.apache.spark.sql.SparkSession; @@ -35,9 +35,9 @@ public SparkLabelUpdater() { @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } protected Pipe setSaveModeOnPipe(Pipe p) { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index 395ad27ee..010205309 100644 --- 
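The license-removal patch applies the same mechanical edit to every Spark executor: init loses the IZinggLicense parameter, delegates to super.init(args), and the old getContext().init(license) call is left behind as a comment. A condensed sketch of the resulting shape, with invented names (BaseExecutor, SparkStyleExecutor) since the real hierarchy is spread across the files in this patch; Arguments and ZinggClientException are the Zingg client types seen in the hunks:

// hypothetical base mirroring ZinggBase.init(Arguments) after the patch
abstract class BaseExecutor {
    protected Arguments args;

    public void init(Arguments args) throws ZinggClientException {
        this.args = args; // no license object anywhere in the chain now
    }
}

// a concrete executor after the refactor: only the arguments flow through
class SparkStyleExecutor extends BaseExecutor {
    @Override
    public void init(Arguments args) throws ZinggClientException {
        super.init(args);
        // getContext().init(license) is commented out in the diffs;
        // context setup no longer depends on a license
    }
}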
a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.Labeller; import org.apache.spark.sql.SparkSession; @@ -35,9 +35,9 @@ public SparkLabeller(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index eb90d0bd3..9c7cabb20 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.Linker; import zingg.common.core.model.Model; import zingg.common.core.preprocess.StopWordsRemover; @@ -30,9 +30,9 @@ public SparkLinker() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index f99caeeab..722a0e4e1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.Matcher; import zingg.common.core.model.Model; import zingg.common.core.preprocess.StopWordsRemover; @@ -40,9 +40,9 @@ public SparkMatcher(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + // getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index e63f942f2..f4c2666e8 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -15,7 +15,7 @@ import zingg.common.client.ClientOptions; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.ZinggBase; import org.apache.spark.sql.SparkSession; @@ -32,9 +32,9 @@ public SparkPeekModel() { } @Override - public void init(Arguments args, IZinggLicense license) + 
public void init(Arguments args) throws ZinggClientException { - super.init(args, license); + super.init(args); getContext().setUtils(); //we will not init here as we want py to drive //the spark session etc diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 2fdf029c5..a156a07d8 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.Recommender; import zingg.common.core.recommender.StopWordsRecommender; import org.apache.spark.sql.SparkSession; @@ -33,9 +33,9 @@ public SparkRecommender() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index e0a7e54b3..3a9143adc 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.TrainMatcher; import org.apache.spark.sql.SparkSession; @@ -29,9 +29,9 @@ public SparkTrainMatcher() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 4b770f3a2..cdf3a3e66 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.Trainer; import zingg.common.core.preprocess.StopWordsRemover; import org.apache.spark.sql.SparkSession; @@ -33,9 +33,9 @@ public SparkTrainer(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index f38a1edd7..6f8c92b0a 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++
b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; -import zingg.common.client.license.IZinggLicense; + import zingg.common.core.executor.TrainingDataFinder; import zingg.common.core.preprocess.StopWordsRemover; import org.apache.spark.sql.SparkSession; @@ -32,9 +32,9 @@ public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { - super.init(args, license); - getContext().init(license); + public void init(Arguments args) throws ZinggClientException { + super.init(args); + //getContext().init(license); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java index bfc86dafd..4c5a5799f 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java @@ -11,7 +11,7 @@ import zingg.common.client.IZingg; import zingg.common.client.ZinggClientException; -import zingg.common.client.license.IZinggLicense; +// import zingg.common.core.Context; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.DSUtil; @@ -59,7 +59,7 @@ public void setSession(SparkSession spark) { @Override - public void init(IZinggLicense license) + public void init() throws ZinggClientException { try{ if (zSession==null) { From 13a3d5b4bcd6714d119161f1c2f6f3120f9094e9 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Tue, 5 Sep 2023 18:37:39 +0530 Subject: [PATCH 009/219] argument removed from writing df, zinggclientexception passed for im --- .DS_Store | Bin 6148 -> 6148 bytes .../zingg/common/core/executor/Linker.java | 2 +- .../zingg/common/core/executor/Matcher.java | 16 +++++---- .../core/executor/TrainingDataFinder.java | 2 +- .../core/executor/TrainingDataModel.java | 2 +- .../java/zingg/common/core/model/Model.java | 33 +++++++++++++----- .../recommender/StopWordsRecommender.java | 2 +- .../common/core/util/BlockingTreeUtil.java | 2 +- .../java/zingg/common/core/util/PipeUtil.java | 2 +- .../zingg/common/core/util/PipeUtilBase.java | 2 +- .../zingg/spark/core/model/SparkModel.java | 29 ++++++++++----- 11 files changed, 63 insertions(+), 29 deletions(-) diff --git a/.DS_Store b/.DS_Store index dd8ddd3f92f86e0123d4095872267ae41f0c0c1b..a52ee2cd7c9efb392fda7aa92ada3832eba598d5 100644 GIT binary patch (two 322-byte base85-encoded deltas omitted; .DS_Store stays 6148 bytes) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java
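The patch whose header appears above makes one recurring API change, visible in the write-call hunks that follow: PipeUtilBase.write no longer takes the Arguments object, only the frame and the target pipes. A hedged sketch of a typical call site after the change (writeResult is an invented helper; getPipeUtil(), args, and the generic parameters D, R, C stand in for whatever the enclosing executor holds, as in the Linker and TrainingDataFinder hunks below):

// sketch of a call site after the change
void writeResult(ZFrame<D, R, C> frame) throws ZinggClientException {
    // old form: getPipeUtil().write(frame, args, args.getOutput());
    // new signature is write(ZFrame toWriteOrig, Pipe... pipes)
    getPipeUtil().write(frame, args.getOutput());
}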
args, args.getOutput()); + getPipeUtil().write(dupes2, args.getOutput()); } } catch (Exception e) { e.printStackTrace(); diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index aef6ea486..776608597 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -193,7 +193,7 @@ public ZFrame removeObvDupesFromBlocks(ZFrame blocks) { return blocks; } - public ZFrame getObvDupePairs(ZFrame blocked) { + public ZFrame getObvDupePairs(ZFrame blocked) throws ZinggClientException { String obviousDupeString = args.getObviousDupeCondition(); @@ -230,6 +230,10 @@ public ZFrame getObvDupePairs(ZFrame blocked) { return onlyIds; } + + public ZFrame getGraph(ZFrame blocked, ZFrame dupesActual){ + return getGraphUtil().buildGraph(blocked, dupesActual).cache(); + } public void writeOutput( ZFrame blocked, ZFrame dupesActual) throws ZinggClientException { try{ @@ -246,13 +250,13 @@ public void writeOutput( ZFrame blocked, ZFrame dupesActual) th */ dupesActual = dupesActual.cache(); - System.out.println("dupes ------------"); + if (LOG.isDebugEnabled()) { dupesActual.show(); } - ZFramegraph = getGraphUtil().buildGraph(blocked, dupesActual).cache(); + ZFramegraph = getGraph(blocked, dupesActual); //graph.toJavaRDD().saveAsTextFile("/tmp/zgraph"); - System.out.println("graph ------------"); + if (LOG.isDebugEnabled()) { graph.show(); } @@ -278,7 +282,7 @@ public void writeOutput( ZFrame blocked, ZFrame dupesActual) th } graphWithScores = getDSUtil().select(graphWithScores, columns); */ - getPipeUtil().write(graphWithScores, args, args.getOutput()); + getPipeUtil().write(graphWithScores, args.getOutput()); } } catch(Exception e) { @@ -287,7 +291,7 @@ public void writeOutput( ZFrame blocked, ZFrame dupesActual) th } - public ZFrame getGraphWithScores(ZFrame graph, ZFrame score) { + public ZFrame getGraphWithScores(ZFrame graph, ZFrame score) throws ZinggClientException { ZFramegraphWithScores = getDSUtil().joinZColFirst( score, graph, ColName.ID_COL, false).cache(); return graphWithScores; diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index 50502afcf..0f93c723d 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -144,7 +144,7 @@ public void writeUncertain(ZFrame dupesActual, ZFrame sampleOrgina dupes1 = getDSUtil().postprocess(dupes1, sampleOrginal); ZFrame dupes2 = dupes1.orderBy(ColName.CLUSTER_COLUMN); //LOG.debug("uncertain output schema is " + dupes2.schema()); - getPipeUtil().write(dupes2 , args, getUnmarkedLocation()); + getPipeUtil().write(dupes2 , getUnmarkedLocation()); //PipeUtil.write(jdbc, massageForJdbc(dupes2.cache()) , args, ctx); } diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index cd5fe1ebd..c8515959b 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -79,7 +79,7 @@ public void writeLabelledOutput(ZFrame records, Arguments args, Pipe p) t LOG.warn("No labelled records"); return; } - getPipeUtil().write(records, args,p); + 
getPipeUtil().write(records, p); } public Pipe getOutputPipe(Arguments args) { diff --git a/common/core/src/main/java/zingg/common/core/model/Model.java b/common/core/src/main/java/zingg/common/core/model/Model.java index ef086514b..33b7989d2 100644 --- a/common/core/src/main/java/zingg/common/core/model/Model.java +++ b/common/core/src/main/java/zingg/common/core/model/Model.java @@ -4,24 +4,34 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.FieldDefinition; + +import zingg.common.client.Arguments; import zingg.common.client.ZFrame; -import zingg.common.core.feature.Feature; +import zingg.common.client.ZinggClientException; public abstract class Model implements Serializable { public static final Log LOG = LogFactory.getLog(Model.class); - //private Map featurers; + private S session; + + public Model() { } public abstract void register(S spark) ; + + public void setSession(S s){ + this.session = s; + } + + public S getSession(){ + return session; + } public static double[] getGrid(double begin, double end, double jump, boolean isMultiple) { List alphaList = new ArrayList(); @@ -29,6 +39,7 @@ public static double[] getGrid(double begin, double end, double jump, boolean is for (double alpha =begin; alpha <= end; alpha *= jump) { alphaList.add(alpha); } + } else { for (double alpha =begin; alpha <= end; alpha += jump) { @@ -42,18 +53,24 @@ public static double[] getGrid(double begin, double end, double jump, boolean is return retArr; } - public abstract void fit(ZFrame pos, ZFrame neg); + public abstract void fit(ZFrame pos, ZFrame neg) throws ZinggClientException; public abstract void load(String path); + + public abstract void fitCore(ZFrame pos, ZFrame neg); + + public abstract ZFrame predict(ZFrame data) throws ZinggClientException; + public abstract ZFrame predict(ZFrame data, boolean isDrop) throws ZinggClientException ; - public abstract ZFrame predict(ZFrame data); - - public abstract ZFrame predict(ZFrame data, boolean isDrop) ; + //this will do the prediction but not drop the columns + public abstract ZFrame predictCore(ZFrame data); public abstract void save(String path) throws IOException; public abstract ZFrame transform(ZFrame input); + + public abstract ZFrame dropFeatureCols(ZFrame f, boolean isDrop); } diff --git a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java index 6105f63a0..3feab36cb 100644 --- a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java +++ b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java @@ -49,7 +49,7 @@ public void createStopWordsDocuments(ZFrame data, String fieldName) throw if(Arrays.asList(data.columns()).contains(args.getColumn())) { String filenameCSV = args.getStopWordsDir() + fieldName; data = findStopWords(data, fieldName); - context.getPipeUtil().write(data, args, context.getPipeUtil().getStopWordsPipe(args, filenameCSV)); + context.getPipeUtil().write(data, context.getPipeUtil().getStopWordsPipe(args, filenameCSV)); } else { LOG.info("An invalid column name - " + args.getColumn() + " entered. 
Please provide valid column name."); } diff --git a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java index 959fbfa7a..db8cc63e6 100644 --- a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java @@ -90,7 +90,7 @@ public Tree> createBlockingTreeFromSample(ZFrame testData, public void writeBlockingTree(Tree> blockingTree, Arguments args) throws Exception, ZinggClientException { byte[] byteArray = Util.convertObjectIntoByteArray(blockingTree); PipeUtilBase pu = getPipeUtil(); - pu.write(getTreeDF(byteArray), args, pu.getBlockingTreePipe(args)); + pu.write(getTreeDF(byteArray), pu.getBlockingTreePipe(args)); } public abstract ZFrame getTreeDF(byte[] tree) ; diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtil.java b/common/core/src/main/java/zingg/common/core/util/PipeUtil.java index bd527cc5a..e7cb6ac63 100644 --- a/common/core/src/main/java/zingg/common/core/util/PipeUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/PipeUtil.java @@ -185,7 +185,7 @@ public ZFrame read(boolean addExtraCol, boolean addLineNo, int numPartit return rows; } - public void write(ZFrame toWriteOrig, Arguments args, + public void write(ZFrame toWriteOrig, Pipe... pipes) throws ZinggClientException { try { for (Pipe p: pipes) { diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java b/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java index 887ddc054..a7d447041 100644 --- a/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java +++ b/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java @@ -29,7 +29,7 @@ public ZFrame read(boolean addLineNo, int numPartitions, public ZFrame read(boolean addExtraCol, boolean addLineNo, int numPartitions, boolean addSource, Pipe... pipes) throws ZinggClientException; - public void write(ZFrame toWriteOrig, Arguments args, Pipe... pipes) + public void write(ZFrame toWriteOrig, Pipe... 
pipes) throws ZinggClientException; diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index 72a17763f..4b95b6a41 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -93,9 +93,11 @@ public SparkModel(Map> f) { columnsAdded.add(ColName.RAW_PREDICTION); } - - public void fit(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { + fitCore(pos, neg); + } + + public void fitCore(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { //transform ZFrame,Row,Column> input = transform(pos.union(neg)).coalesce(1).cache(); //if (LOG.isDebugEnabled()) input.write().csv("/tmp/input/" + System.currentTimeMillis()); @@ -127,13 +129,28 @@ public void load(String path) { transformer = CrossValidatorModel.load(path); } - public ZFrame,Row,Column> predict(ZFrame,Row,Column> data) { return predict(data, true); } @Override public ZFrame,Row,Column> predict(ZFrame,Row,Column> data, boolean isDrop) { + return dropFeatureCols(predictCore(data), isDrop); + } + + + @Override + public ZFrame,Row,Column> dropFeatureCols(ZFrame,Row,Column> predictWithFeatures, boolean isDrop){ + if (isDrop) { + ZFrame,Row,Column> returnDS = predictWithFeatures.drop(columnsAdded.toArray(new String[columnsAdded.size()])); + //LOG.debug("Return schema after dropping additional columns is " + returnDS.schema()); + return returnDS; //new SparkFrame(returnDS); + } + return predictWithFeatures; + } + + @Override + public ZFrame,Row,Column> predictCore(ZFrame,Row,Column> data) { //create features LOG.info("threshold while predicting is " + lr.getThreshold()); //lr.setThreshold(0.95); @@ -143,11 +160,7 @@ public ZFrame,Row,Column> predict(ZFrame,Row,Column> d //LOG.debug(predictWithFeatures.schema()); predictWithFeatures = vve.transform(predictWithFeatures); //LOG.debug("Original schema is " + predictWithFeatures.schema()); - if (isDrop) { - Dataset returnDS = predictWithFeatures.drop(columnsAdded.toArray(new String[columnsAdded.size()])); - //LOG.debug("Return schema after dropping additional columns is " + returnDS.schema()); - return new SparkFrame(returnDS); - } + LOG.debug("Return schema is " + predictWithFeatures.schema()); return new SparkFrame(predictWithFeatures); From 877e8e13a83489aff43378db80ce354b791acd7e Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Tue, 5 Sep 2023 23:11:37 +0530 Subject: [PATCH 010/219] more model refactoring for enterprise --- .../main/java/zingg/common/client/ZFrame.java | 1 + .../java/zingg/common/core/model/Model.java | 18 +++++++++++++++++- .../zingg/spark/core/model/SparkModel.java | 9 +++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index 417190771..1f910d09a 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -22,6 +22,7 @@ public interface ZFrame { public ZFrame select(C... cols); public ZFrame select(List cols); public ZFrame select(String col, String... cols); + //public ZFrame select(String... cols); public ZFrame select(String col); public ZFrame selectExpr(String...
col); public ZFrame distinct(); diff --git a/common/core/src/main/java/zingg/common/core/model/Model.java b/common/core/src/main/java/zingg/common/core/model/Model.java index 33b7989d2..efcda0f93 100644 --- a/common/core/src/main/java/zingg/common/core/model/Model.java +++ b/common/core/src/main/java/zingg/common/core/model/Model.java @@ -11,11 +11,13 @@ import zingg.common.client.Arguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; public abstract class Model implements Serializable { public static final Log LOG = LogFactory.getLog(Model.class); private S session; + protected List columnsAdded = new ArrayList(); @@ -32,7 +34,21 @@ public void setSession(S s){ public S getSession(){ return session; } + + protected String getColumnName(String fieldName, String fnName, int count) { + return ColName.SIM_COL + count; + } + + public List getColumnsAdded() { + return columnsAdded; + } + + + public void setColumnsAdded(List columnsAdded) { + this.columnsAdded = columnsAdded; + } + public static double[] getGrid(double begin, double end, double jump, boolean isMultiple) { List alphaList = new ArrayList(); if (isMultiple) { @@ -58,7 +74,7 @@ public static double[] getGrid(double begin, double end, double jump, boolean is public abstract void load(String path); - public abstract void fitCore(ZFrame pos, ZFrame neg); + public abstract ZFrame fitCore(ZFrame pos, ZFrame neg); public abstract ZFrame predict(ZFrame data) throws ZinggClientException; diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index 4b95b6a41..c6f83ec23 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -44,19 +44,19 @@ public class SparkModel extends Model, Row, LogisticRegression lr; Transformer transformer; BinaryClassificationEvaluator binaryClassificationEvaluator; - List columnsAdded; + VectorValueExtractor vve; public SparkModel(Map> f) { featureCreators = new ArrayList(); pipelineStage = new ArrayList (); - columnsAdded = new ArrayList (); int count = 0; for (FieldDefinition fd : f.keySet()) { Feature fea = f.get(fd); List sfList = fea.getSimFunctions(); for (SimFunction sf : sfList) { - String outputCol = ColName.SIM_COL + count; + + String outputCol = getColumnName(fd.fieldName, sf.getName(), count); columnsAdded.add(outputCol); SparkTransformer st = new SparkTransformer(fd.fieldName, new SparkSimFunction(sf), outputCol); count++; @@ -97,7 +97,7 @@ public void fit(ZFrame,Row,Column> pos, ZFrame,Row,Col fitCore(pos, neg); } - public void fitCore(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { + public ZFrame,Row,Column> fitCore(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { //transform ZFrame,Row,Column> input = transform(pos.union(neg)).coalesce(1).cache(); //if (LOG.isDebugEnabled()) input.write().csv("/tmp/input/" + System.currentTimeMillis()); @@ -122,6 +122,7 @@ public void fitCore(ZFrame,Row,Column> pos, ZFrame,Row CrossValidatorModel cvModel = cv.fit(input.df()); transformer = cvModel; LOG.debug("threshold after fitting is " + lr.getThreshold()); + return input; } From 944b59f24ee37bcd53a4989b0b44c7c17bdb4120 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Tue, 5 Sep 2023 23:36:00 +0530 Subject: [PATCH 011/219] more model restruc for ent --- .../src/main/java/zingg/common/core/model/Model.java | 12 ++++++++++-- 
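Patches 010 and 011 (the latter summarized in the surrounding stat lines) converge on a template-method split: the public predict path stays in the abstract Model, dropFeatureCols is hoisted into the base class, and the engines supply only predictCore. A compact sketch of the pattern under simplified generics, since the generic parameters in this document are mangled by extraction; BaseModel, the F type parameter, and the drop hook are illustrative names, the sketch collapses the split into one class for brevity, and the checked ZinggClientException is omitted:

import java.util.ArrayList;
import java.util.List;

public abstract class BaseModel<F> {

    // names of helper columns added while scoring, shared by all engines
    protected List<String> columnsAdded = new ArrayList<String>();

    // template method: the public entry point stays in the base class
    public F predict(F data, boolean isDrop) {
        return dropFeatureCols(predictCore(data), isDrop);
    }

    // engine-specific scoring that keeps the helper columns
    protected abstract F predictCore(F data);

    // shared cleanup, hoisted from SparkModel in patch 011
    public F dropFeatureCols(F predictWithFeatures, boolean isDrop) {
        if (isDrop) {
            return drop(predictWithFeatures,
                    columnsAdded.toArray(new String[columnsAdded.size()]));
        }
        return predictWithFeatures;
    }

    // stand-in for ZFrame.drop(String...), so the sketch stays self-contained
    protected abstract F drop(F frame, String... cols);
}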
.../main/java/zingg/spark/core/model/SparkModel.java | 10 +--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/model/Model.java b/common/core/src/main/java/zingg/common/core/model/Model.java index efcda0f93..c39efe372 100644 --- a/common/core/src/main/java/zingg/common/core/model/Model.java +++ b/common/core/src/main/java/zingg/common/core/model/Model.java @@ -17,7 +17,8 @@ public abstract class Model implements Serializable { public static final Log LOG = LogFactory.getLog(Model.class); private S session; - protected List columnsAdded = new ArrayList(); + protected + List columnsAdded = new ArrayList(); @@ -87,6 +88,13 @@ public static double[] getGrid(double begin, double end, double jump, boolean is public abstract ZFrame transform(ZFrame input); - public abstract ZFrame dropFeatureCols(ZFrame f, boolean isDrop); + public ZFrame dropFeatureCols(ZFrame predictWithFeatures, boolean isDrop){ + if (isDrop) { + ZFrame returnDS = predictWithFeatures.drop(columnsAdded.toArray(new String[columnsAdded.size()])); + //LOG.debug("Return schema after dropping additional columns is " + returnDS.schema()); + return returnDS; //new SparkFrame(returnDS); + } + return predictWithFeatures; + } } diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index c6f83ec23..8b8a63d79 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -140,15 +140,7 @@ public ZFrame,Row,Column> predict(ZFrame,Row,Column> d } - @Override - public ZFrame,Row,Column> dropFeatureCols(ZFrame,Row,Column> predictWithFeatures, boolean isDrop){ - if (isDrop) { - ZFrame,Row,Column> returnDS = predictWithFeatures.drop(columnsAdded.toArray(new String[columnsAdded.size()])); - //LOG.debug("Return schema after dropping additional columns is " + returnDS.schema()); - return returnDS; //new SparkFrame(returnDS); - } - return predictWithFeatures; - } + @Override public ZFrame,Row,Column> predictCore(ZFrame,Row,Column> data) { From 8d6272fb564a1555c86d8e735676ea284e623996 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 6 Sep 2023 10:25:41 +0530 Subject: [PATCH 012/219] model refactoring --- .../java/zingg/common/core/model/Model.java | 20 ++++--------------- .../zingg/spark/core/model/SparkModel.java | 4 ++-- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/model/Model.java b/common/core/src/main/java/zingg/common/core/model/Model.java index c39efe372..5c4c46bce 100644 --- a/common/core/src/main/java/zingg/common/core/model/Model.java +++ b/common/core/src/main/java/zingg/common/core/model/Model.java @@ -16,25 +16,13 @@ public abstract class Model implements Serializable { public static final Log LOG = LogFactory.getLog(Model.class); - private S session; - protected - List columnsAdded = new ArrayList(); + protected List columnsAdded = new ArrayList(); - - - public Model() { - } public abstract void register(S spark) ; - public void setSession(S s){ - this.session = s; - } - - public S getSession(){ - return session; - } + protected String getColumnName(String fieldName, String fnName, int count) { return ColName.SIM_COL + count; @@ -75,14 +63,14 @@ public static double[] getGrid(double begin, double end, double jump, boolean is public abstract void load(String path); - public abstract ZFrame fitCore(ZFrame pos, ZFrame neg); + protected 
abstract ZFrame fitCore(ZFrame pos, ZFrame neg); public abstract ZFrame predict(ZFrame data) throws ZinggClientException; public abstract ZFrame predict(ZFrame data, boolean isDrop) throws ZinggClientException ; //this will do the prediction but not drop the columns - public abstract ZFrame predictCore(ZFrame data); + protected abstract ZFrame predictCore(ZFrame data); public abstract void save(String path) throws IOException; diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index 8b8a63d79..a8ce385b7 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -97,7 +97,7 @@ public void fit(ZFrame,Row,Column> pos, ZFrame,Row,Col fitCore(pos, neg); } - public ZFrame,Row,Column> fitCore(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { + protected ZFrame,Row,Column> fitCore(ZFrame,Row,Column> pos, ZFrame,Row,Column> neg) { //transform ZFrame,Row,Column> input = transform(pos.union(neg)).coalesce(1).cache(); //if (LOG.isDebugEnabled()) input.write().csv("/tmp/input/" + System.currentTimeMillis()); @@ -143,7 +143,7 @@ public ZFrame,Row,Column> predict(ZFrame,Row,Column> d @Override - public ZFrame,Row,Column> predictCore(ZFrame,Row,Column> data) { + protected ZFrame,Row,Column> predictCore(ZFrame,Row,Column> data) { //create features LOG.info("threshold while predicting is " + lr.getThreshold()); //lr.setThreshold(0.95); From 1c403b85db86b06ad99c72120cb1441a32e584b2 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 6 Sep 2023 13:05:25 +0530 Subject: [PATCH 013/219] model session constructor --- .../main/java/zingg/common/core/model/Model.java | 16 ++++++++++++++-- .../java/zingg/common/core/util/ModelUtil.java | 6 +++++- .../zingg/spark/core/executor/SparkLinker.java | 2 +- .../zingg/spark/core/executor/SparkMatcher.java | 2 +- .../spark/core/executor/ZinggSparkContext.java | 1 - .../zingg/spark/core/model/SparkLabelModel.java | 5 +++-- .../java/zingg/spark/core/model/SparkModel.java | 10 ++++++---- .../zingg/spark/core/util/SparkModelUtil.java | 6 +++--- 8 files changed, 33 insertions(+), 15 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/model/Model.java b/common/core/src/main/java/zingg/common/core/model/Model.java index 5c4c46bce..461a4103d 100644 --- a/common/core/src/main/java/zingg/common/core/model/Model.java +++ b/common/core/src/main/java/zingg/common/core/model/Model.java @@ -17,10 +17,22 @@ public abstract class Model implements Serializable { public static final Log LOG = LogFactory.getLog(Model.class); protected List columnsAdded = new ArrayList(); + protected S session; + + public void setSession(S s){ + this.session = s; + } + + public S getSession(){ + return session; + } + + public Model(S s){ + this.session = s; + } - - public abstract void register(S spark) ; + public abstract void register() ; diff --git a/common/core/src/main/java/zingg/common/core/util/ModelUtil.java b/common/core/src/main/java/zingg/common/core/util/ModelUtil.java index c3b61439c..0cd277705 100644 --- a/common/core/src/main/java/zingg/common/core/util/ModelUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/ModelUtil.java @@ -22,6 +22,10 @@ public abstract class ModelUtil { public static final Log LOG = LogFactory.getLog(ModelUtil.class); protected Map> featurers; protected S session; + + public ModelUtil(S s) { + this.session = s; + } public abstract FeatureFactory 
getFeatureFactory(); @@ -72,7 +76,7 @@ public Model createModel(ZFrame positives, + negLabeledPointsWithLabel.count()); } Model model = getModel(isLabel, args); - model.register(session); + model.register(); model.fit(posLabeledPointsWithLabel, negLabeledPointsWithLabel); return model; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 9c7cabb20..0be72e618 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -38,7 +38,7 @@ public void init(Arguments args) throws ZinggClientException { @Override public Model getModel() throws ZinggClientException { Model model = getModelUtil().loadModel(false, args); - model.register(getContext().getSession()); + model.register(); return model; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index 722a0e4e1..18e4f8594 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -49,7 +49,7 @@ public void init(Arguments args) throws ZinggClientException { @Override public Model getModel() throws ZinggClientException { Model model = getModelUtil().loadModel(false, args); - model.register(getContext().getSession()); + model.register(); return model; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java index 4c5a5799f..074806806 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java @@ -19,7 +19,6 @@ import zingg.common.core.util.HashUtil; import zingg.common.core.util.ModelUtil; import zingg.common.core.util.PipeUtilBase; -import org.apache.spark.sql.SparkSession; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkDSUtil; import zingg.spark.core.util.SparkGraphUtil; diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkLabelModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkLabelModel.java index 18f563050..d7c5f32ef 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkLabelModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkLabelModel.java @@ -2,6 +2,7 @@ import java.util.Map; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; import zingg.common.client.FieldDefinition; @@ -11,8 +12,8 @@ public class SparkLabelModel extends SparkModel{ private static final long serialVersionUID = 1L; - public SparkLabelModel(Map> f) { - super(f); + public SparkLabelModel(SparkSession s, Map> f) { + super(s,f); } } diff --git a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java index a8ce385b7..607962429 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java +++ b/spark/core/src/main/java/zingg/spark/core/model/SparkModel.java @@ -22,6 +22,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.SparkSession; import zingg.common.client.FieldDefinition; import zingg.common.client.ZFrame; @@ -47,7 +48,8 @@ public class SparkModel 
extends Model, Row, VectorValueExtractor vve; - public SparkModel(Map> f) { + public SparkModel(SparkSession s, Map> f) { + super(s); featureCreators = new ArrayList(); pipelineStage = new ArrayList (); int count = 0; @@ -177,13 +179,13 @@ public ZFrame,Row,Column> transform(ZFrame,Row,Column> @Override - public void register(SparkSession spark) { + public void register() { if (featureCreators != null) { for (SparkTransformer bsf: featureCreators) { - bsf.register(spark); + bsf.register(session); } } - vve.register(spark); + vve.register(session); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java index 5967e7c72..3099e20ce 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java @@ -23,16 +23,16 @@ public class SparkModelUtil extends ModelUtil public SparkModelUtil(SparkSession s) { - this.session = s; + super(s); } public Model, Row, Column> getModel(boolean isLabel, Arguments args) throws ZinggClientException{ Model, Row, Column> model = null; if (isLabel) { - model = new SparkLabelModel(getFeaturers(args)); + model = new SparkLabelModel(session, getFeaturers(args)); } else { - model = new SparkModel(getFeaturers(args)); + model = new SparkModel(session, getFeaturers(args)); } return model; } From 06e50b472993b484e7acae5e667aebf441e425ee Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 6 Sep 2023 13:53:04 +0530 Subject: [PATCH 014/219] context changes --- .../main/java/zingg/common/core/Context.java | 57 ------ .../zingg/common/core/context/Context.java | 89 ++++++++ .../core/documenter/DataColDocumenter.java | 2 +- .../core/documenter/DataDocumenter.java | 2 +- .../core/documenter/DocumenterBase.java | 2 +- .../core/documenter/ModelColDocumenter.java | 2 +- .../core/documenter/ModelDocumenter.java | 2 +- .../core/executor/LabelDataViewHelper.java | 2 +- .../core/executor/TrainingDataModel.java | 2 +- .../zingg/common/core/executor/ZinggBase.java | 2 +- .../core/preprocess/StopWordsRemover.java | 2 +- .../recommender/StopWordsRecommender.java | 2 +- .../spark/core/context/ZinggSparkContext.java | 95 +++++++++ .../documenter/SparkDataColDocumenter.java | 2 +- .../core/documenter/SparkDataDocumenter.java | 2 +- .../documenter/SparkModelColDocumenter.java | 2 +- .../core/documenter/SparkModelDocumenter.java | 2 +- .../spark/core/executor/SparkDocumenter.java | 1 + .../core/executor/SparkFindAndLabeller.java | 4 +- .../core/executor/SparkLabelUpdater.java | 1 + .../spark/core/executor/SparkLabeller.java | 4 +- .../spark/core/executor/SparkLinker.java | 1 + .../spark/core/executor/SparkMatcher.java | 2 +- .../spark/core/executor/SparkPeekModel.java | 4 +- .../spark/core/executor/SparkRecommender.java | 1 + .../core/executor/SparkTrainMatcher.java | 1 + .../spark/core/executor/SparkTrainer.java | 6 +- .../executor/SparkTrainingDataFinder.java | 2 +- .../core/executor/ZinggSparkContext.java | 191 ------------------ .../preprocess/SparkStopWordsRemover.java | 2 +- .../SparkStopWordsRecommender.java | 2 +- 31 files changed, 220 insertions(+), 271 deletions(-) delete mode 100644 common/core/src/main/java/zingg/common/core/Context.java create mode 100644 common/core/src/main/java/zingg/common/core/context/Context.java create mode 100644 spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java delete mode 100644 spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java 
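Patch 013, shown above, moves the session from register(SparkSession) into the constructors: Model stores the session it is built with, SparkModel passes it through super(s), and register() becomes parameterless. A small sketch of that wiring (SessionModel is an invented name standing in for the Model hierarchy; the real field is mutable via setSession, the final modifier here is just for the sketch):

public abstract class SessionModel<S> {

    protected final S session;

    // the session now arrives at construction time...
    public SessionModel(S session) {
        this.session = session;
    }

    // ...so registration no longer needs it as a parameter
    public abstract void register();
}

Call sites change accordingly: where the utilities previously did model.register(session), they now construct the model with the session and call model.register(), as the SparkModelUtil and executor hunks above show.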
diff --git a/common/core/src/main/java/zingg/common/core/Context.java b/common/core/src/main/java/zingg/common/core/Context.java deleted file mode 100644 index f51ba1447..000000000 --- a/common/core/src/main/java/zingg/common/core/Context.java +++ /dev/null @@ -1,57 +0,0 @@ -package zingg.common.core; - -import java.io.Serializable; - -import zingg.common.client.ZinggClientException; -import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.DSUtil; -import zingg.common.core.util.GraphUtil; -import zingg.common.core.util.HashUtil; -import zingg.common.core.util.ModelUtil; -import zingg.common.core.util.PipeUtilBase; - -public interface Context extends Serializable { - - public HashUtil getHashUtil() ; - public void setHashUtil(HashUtil t) ; - public GraphUtil getGraphUtil() ; - - public void setGraphUtil(GraphUtil t) ; - - public void setModelUtil(ModelUtil t); - public void setBlockingTreeUtil(BlockingTreeUtil t) ; - - public ModelUtil getModelUtil(); - - public void setPipeUtil(PipeUtilBase pipeUtil); - public void setDSUtil(DSUtil pipeUtil); - public DSUtil getDSUtil() ; - public PipeUtilBase getPipeUtil(); - public BlockingTreeUtil getBlockingTreeUtil() ; - - public void init() - throws ZinggClientException; - - public void cleanup(); - - /**convenience method to set all utils - * especially useful when you dont want to create the connection/spark context etc - * */ - public void setUtils(); - - public S getSession(); - - public void setSession(S session); - - - //public void initHashFns() throws ZinggClientException; - - - - - - } - - - - diff --git a/common/core/src/main/java/zingg/common/core/context/Context.java b/common/core/src/main/java/zingg/common/core/context/Context.java new file mode 100644 index 000000000..130cbbe5c --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/context/Context.java @@ -0,0 +1,89 @@ +package zingg.common.core.context; + +import java.io.Serializable; + +import zingg.common.client.ZinggClientException; +import zingg.common.core.util.BlockingTreeUtil; +import zingg.common.core.util.DSUtil; +import zingg.common.core.util.GraphUtil; +import zingg.common.core.util.HashUtil; +import zingg.common.core.util.ModelUtil; +import zingg.common.core.util.PipeUtilBase; + +public abstract class Context implements Serializable { + protected S session; + protected PipeUtilBase pipeUtil; + protected HashUtil hashUtil; + protected DSUtil dsUtil; + protected GraphUtil graphUtil; + protected ModelUtil modelUtil; + protected BlockingTreeUtil blockingTreeUtil; + + public static final String hashFunctionFile = "hashFunctions.json"; + + public HashUtil getHashUtil() { + return this.hashUtil; + } + public void setHashUtil(HashUtil t) { + this.hashUtil = t; + } + public GraphUtil getGraphUtil() { + return this.graphUtil; + } + + public void setGraphUtil(GraphUtil t) { + this.graphUtil = t; + } + + public void setModelUtil(ModelUtil t){ + this.modelUtil = t; + } + public void setBlockingTreeUtil(BlockingTreeUtil t) { + this.blockingTreeUtil = t; + } + + public ModelUtil getModelUtil(){ + return this.modelUtil; + } + + public void setPipeUtil(PipeUtilBase pipeUtil){ + this.pipeUtil = pipeUtil; + } + public void setDSUtil(DSUtil d){ + this.dsUtil = d; + } + public DSUtil getDSUtil() { + return this.dsUtil; + } + public PipeUtilBase getPipeUtil(){ + return this.pipeUtil; + } + public BlockingTreeUtil getBlockingTreeUtil() { + return this.blockingTreeUtil; + } + + public abstract void init() + throws ZinggClientException; + + public abstract void 
cleanup(); + + /**convenience method to set all utils + * especially useful when you dont want to create the connection/spark context etc + * */ + public abstract void setUtils(); + + public S getSession(){ + return session; + } + + public void setSession(S session){ + this.session = session; + } + + + + } + + + + diff --git a/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java index 73d29141e..b8a357975 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java @@ -6,7 +6,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public abstract class DataColDocumenter extends DocumenterBase { protected static String name = "zingg.DataColDocumenter"; diff --git a/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java index f3dea702c..34cff617e 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java @@ -12,7 +12,7 @@ import zingg.common.client.FieldData; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public abstract class DataDocumenter extends DocumenterBase { protected static String name = "zingg.DataDocumenter"; diff --git a/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java b/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java index 7b715d7d4..d19dcfed1 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java @@ -12,7 +12,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.executor.ZinggBase; public abstract class DocumenterBase extends ZinggBase{ diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java index f443cdc5c..9ad598aab 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public abstract class ModelColDocumenter extends DocumenterBase { protected static String name = "zingg.ModelColDocumenter"; diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java index a5ba1586d..c1cf78279 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java @@ -15,7 +15,7 @@ import 
zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public abstract class ModelDocumenter extends DocumenterBase { diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index 65a650a40..c63f7dd6a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -13,7 +13,7 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.util.LabelMatchType; public class LabelDataViewHelper extends ZinggBase implements ILabelDataViewHelper { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index c8515959b..b50bbbe8f 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -12,7 +12,7 @@ import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public class TrainingDataModel extends ZinggBase implements ITrainingDataModel{ diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index e08536bbd..49b0a4d0a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -16,7 +16,7 @@ import zingg.common.client.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.util.Analytics; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.DSUtil; diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java index 12e4343cf..bd3ddb50a 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java @@ -13,7 +13,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.util.PipeUtilBase; public abstract class StopWordsRemover implements Serializable{ diff --git a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java index 3feab36cb..d6783a40d 100644 --- a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java +++ b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java @@ -10,7 +10,7 @@ import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggClientException; import 
zingg.common.client.util.ColName; -import zingg.common.core.Context; +import zingg.common.core.context.Context; public abstract class StopWordsRecommender { private static final String REGEX_WHITESPACE = "\\s+"; diff --git a/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java new file mode 100644 index 000000000..2a39c6942 --- /dev/null +++ b/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java @@ -0,0 +1,95 @@ +package zingg.spark.core.context; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; + +import zingg.common.client.IZingg; +import zingg.common.client.ZinggClientException; +// +import zingg.common.core.context.Context; +import zingg.common.core.util.BlockingTreeUtil; +import zingg.common.core.util.DSUtil; +import zingg.common.core.util.GraphUtil; +import zingg.common.core.util.HashUtil; +import zingg.common.core.util.ModelUtil; +import zingg.common.core.util.PipeUtilBase; +import zingg.spark.core.util.SparkBlockingTreeUtil; +import zingg.spark.core.util.SparkDSUtil; +import zingg.spark.core.util.SparkGraphUtil; +import zingg.spark.core.util.SparkHashUtil; +import zingg.spark.core.util.SparkModelUtil; +import zingg.spark.core.util.SparkPipeUtil; + + +public class ZinggSparkContext extends Context, Row,Column,DataType>{ + + + private static final long serialVersionUID = 1L; + protected JavaSparkContext ctx; + public static final Log LOG = LogFactory.getLog(ZinggSparkContext.class); + + + + @Override + public void init() + throws ZinggClientException { + try{ + if (session==null) { + session = SparkSession + .builder() + .appName("Zingg") + .getOrCreate(); + + //session = new SparkSession(spark, license); + } + if (ctx==null) { + ctx = JavaSparkContext.fromSparkContext(session.sparkContext()); + JavaSparkContext.jarOfClass(IZingg.class); + LOG.debug("Context " + ctx.toString()); + //initHashFns(); + ctx.setCheckpointDir("/tmp/checkpoint"); + setUtils(); + } + } + catch(Throwable e) { + if (LOG.isDebugEnabled()) e.printStackTrace(); + throw new ZinggClientException(e.getMessage()); + } + } + + @Override + public void cleanup() { + try { + if (ctx != null) { + ctx.stop(); + } + if (session!=null) { + session.stop(); + } + ctx = null; + session = null; + } catch (Exception e) { + // ignore any exception in cleanup + e.printStackTrace(); + } + } + + @Override + public void setUtils() { + LOG.debug("Session passed to utils is " + session); + setPipeUtil(new SparkPipeUtil(session)); + setDSUtil(new SparkDSUtil(session)); + setHashUtil(new SparkHashUtil(session)); + setGraphUtil(new SparkGraphUtil()); + setModelUtil(new SparkModelUtil(session)); + setBlockingTreeUtil(new SparkBlockingTreeUtil(session, getPipeUtil())); + } + + + } \ No newline at end of file diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java index ec6ae2bc2..c9646da41 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java @@ -7,7 +7,7 @@ import freemarker.template.Version; import 
zingg.common.client.Arguments; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.documenter.DataColDocumenter; import zingg.common.core.documenter.RowWrapper; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java index 78b3e107a..465500879 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java @@ -7,7 +7,7 @@ import freemarker.template.Version; import zingg.common.client.Arguments; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.documenter.DataDocumenter; import zingg.common.core.documenter.RowWrapper; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java index 1145f9408..741ed3dc3 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java @@ -7,7 +7,7 @@ import freemarker.template.Version; import zingg.common.client.Arguments; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.documenter.ModelColDocumenter; import zingg.common.core.documenter.RowWrapper; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java index e53ce48c4..cb03167f0 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java @@ -7,7 +7,7 @@ import freemarker.template.Version; import zingg.common.client.Arguments; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.documenter.ModelDocumenter; import zingg.common.core.documenter.RowWrapper; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 954302b24..b4d68f0e5 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -17,6 +17,7 @@ import org.apache.spark.sql.SparkSession; import zingg.spark.core.documenter.SparkDataDocumenter; import zingg.spark.core.documenter.SparkModelDocumenter; +import zingg.spark.core.context.ZinggSparkContext; public class SparkDocumenter extends Documenter, Row, Column,DataType> { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index 1deb9d8e7..06855915b 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -7,13 +7,15 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.SparkSession; import 
zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.core.executor.FindAndLabeller; -import org.apache.spark.sql.SparkSession; +import zingg.spark.core.context.ZinggSparkContext; + public class SparkFindAndLabeller extends FindAndLabeller, Row, Column,DataType> { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index 1a942a9b3..8d5e3b318 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -14,6 +14,7 @@ import zingg.common.client.pipe.Pipe; import zingg.common.core.executor.LabelUpdater; +import zingg.spark.core.context.ZinggSparkContext; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index 010205309..87cf07fc1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -6,13 +6,15 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.SparkSession; import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; +import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Labeller; -import org.apache.spark.sql.SparkSession; + /** * Spark specific implementation of Labeller diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 0be72e618..500548d7c 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -16,6 +16,7 @@ import zingg.common.core.preprocess.StopWordsRemover; import org.apache.spark.sql.SparkSession; import zingg.spark.core.preprocess.SparkStopWordsRemover; +import zingg.spark.core.context.ZinggSparkContext; public class SparkLinker extends Linker, Row, Column,DataType> { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index 18e4f8594..a05f86f27 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; - +import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Matcher; import zingg.common.core.model.Model; import zingg.common.core.preprocess.StopWordsRemover; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index f4c2666e8..d23a6bb43 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -10,6 +10,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.SparkSession; 
import zingg.common.client.Arguments; import zingg.common.client.ClientOptions; @@ -17,7 +18,8 @@ import zingg.common.client.ZinggOptions; import zingg.common.core.executor.ZinggBase; -import org.apache.spark.sql.SparkSession; +import zingg.spark.core.context.ZinggSparkContext; + public class SparkPeekModel extends ZinggBase, Row, Column, DataType>{ diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index a156a07d8..480f4ec4f 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -14,6 +14,7 @@ import zingg.common.core.executor.Recommender; import zingg.common.core.recommender.StopWordsRecommender; import org.apache.spark.sql.SparkSession; +import zingg.spark.core.context.ZinggSparkContext; import zingg.spark.core.recommender.SparkStopWordsRecommender; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 3a9143adc..adc05458c 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -12,6 +12,7 @@ import zingg.common.client.ZinggOptions; import zingg.common.core.executor.TrainMatcher; +import zingg.spark.core.context.ZinggSparkContext; import org.apache.spark.sql.SparkSession; public class SparkTrainMatcher extends TrainMatcher, Row, Column,DataType> { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index cdf3a3e66..e6d0af4cb 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -2,18 +2,20 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; + import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.SparkSession; import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; - +import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Trainer; import zingg.common.core.preprocess.StopWordsRemover; -import org.apache.spark.sql.SparkSession; + import zingg.spark.core.preprocess.SparkStopWordsRemover; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index 6f8c92b0a..4e7c9ecb8 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; - +import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.TrainingDataFinder; import zingg.common.core.preprocess.StopWordsRemover; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java deleted file mode 
100644 index 074806806..000000000 --- a/spark/core/src/main/java/zingg/spark/core/executor/ZinggSparkContext.java +++ /dev/null @@ -1,191 +0,0 @@ -package zingg.spark.core.executor; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataType; - -import zingg.common.client.IZingg; -import zingg.common.client.ZinggClientException; -// -import zingg.common.core.Context; -import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.DSUtil; -import zingg.common.core.util.GraphUtil; -import zingg.common.core.util.HashUtil; -import zingg.common.core.util.ModelUtil; -import zingg.common.core.util.PipeUtilBase; -import zingg.spark.core.util.SparkBlockingTreeUtil; -import zingg.spark.core.util.SparkDSUtil; -import zingg.spark.core.util.SparkGraphUtil; -import zingg.spark.core.util.SparkHashUtil; -import zingg.spark.core.util.SparkModelUtil; -import zingg.spark.core.util.SparkPipeUtil; - - -public class ZinggSparkContext implements Context, Row,Column,DataType>{ - - - private static final long serialVersionUID = 1L; - protected JavaSparkContext ctx; - protected SparkSession zSession; - protected PipeUtilBase, Row, Column> pipeUtil; - protected HashUtil, Row, Column, DataType> hashUtil; - protected DSUtil, Row, Column> dsUtil; - protected GraphUtil, Row, Column> graphUtil; - protected ModelUtil, Row, Column> modelUtil; - protected BlockingTreeUtil, Row, Column, DataType> blockingTreeUtil; - - public static final String hashFunctionFile = "hashFunctions.json"; - - - public static final Log LOG = LogFactory.getLog(ZinggSparkContext.class); - - - public SparkSession getSession() { - return zSession; - } - - public void setSession(SparkSession spark) { - LOG.debug("Session passed to context is " + spark); - this.zSession = spark; - } - - - - @Override - public void init() - throws ZinggClientException { - try{ - if (zSession==null) { - zSession = SparkSession - .builder() - .appName("Zingg") - .getOrCreate(); - - //zSession = new SparkSession(spark, license); - } - if (ctx==null) { - ctx = JavaSparkContext.fromSparkContext(zSession.sparkContext()); - JavaSparkContext.jarOfClass(IZingg.class); - LOG.debug("Context " + ctx.toString()); - //initHashFns(); - ctx.setCheckpointDir("/tmp/checkpoint"); - setUtils(); - } - } - catch(Throwable e) { - if (LOG.isDebugEnabled()) e.printStackTrace(); - throw new ZinggClientException(e.getMessage()); - } - } - - @Override - public void cleanup() { - try { - if (ctx != null) { - ctx.stop(); - } - if (zSession!=null) { - zSession.stop(); - } - ctx = null; - zSession = null; - } catch (Exception e) { - // ignore any exception in cleanup - e.printStackTrace(); - } - } - - @Override - public void setUtils() { - LOG.debug("Session passed to utils is " + zSession); - setPipeUtil(new SparkPipeUtil(zSession)); - setDSUtil(new SparkDSUtil(zSession)); - setHashUtil(new SparkHashUtil(zSession)); - setGraphUtil(new SparkGraphUtil()); - setModelUtil(new SparkModelUtil(zSession)); - setBlockingTreeUtil(new SparkBlockingTreeUtil(zSession, getPipeUtil())); - } - - /** - public void initHashFns() throws ZinggClientException { - try { - //functions = Util.getFunctionList(this.functionFile); - hashFunctions = getHashUtil().getHashFunctionList(hashFunctionFile, getContext()); - } catch (Exception e) { 
- if (LOG.isDebugEnabled()) e.printStackTrace(); - throw new ZinggClientException("Unable to initialize base functions"); - } - } - */ - - - - public void setHashUtil(HashUtil, Row, Column, DataType> t) { - this.hashUtil = t; - } - - public void setGraphUtil(GraphUtil, Row, Column> t) { - this.graphUtil = t; - } - - - - public void setPipeUtil(PipeUtilBase, Row, Column> pipeUtil) { - this.pipeUtil = pipeUtil; - } - - - public void setDSUtil(DSUtil, Row, Column> pipeUtil) { - this.dsUtil = pipeUtil; - } - - public void setBlockingTreeUtil(BlockingTreeUtil, Row, Column, DataType> d) { - this.blockingTreeUtil = d; - } - - public void setModelUtil(ModelUtil, Row, Column> t) { - this.modelUtil = t; - } - - public ModelUtil, Row, Column> getModelUtil() { - return modelUtil; - } - - /* @Override - public void setSession(SparkSession session) { - this.spark = session; - } - */ - - @Override - public HashUtil, Row, Column, DataType> getHashUtil() { - return hashUtil; - } - - @Override - public GraphUtil, Row, Column> getGraphUtil() { - return graphUtil; - } - - @Override - public DSUtil, Row, Column> getDSUtil() { - return dsUtil; - } - - @Override - public PipeUtilBase, Row, Column> getPipeUtil() { - return pipeUtil; - } - - @Override - public BlockingTreeUtil, Row, Column, DataType> getBlockingTreeUtil() { - return blockingTreeUtil; - } - - } \ No newline at end of file diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index a2f3fe666..27cd9c95d 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -15,7 +15,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZFrame; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.preprocess.StopWordsRemover; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java index 21f793d3f..fe1670fac 100644 --- a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java @@ -8,7 +8,7 @@ import org.apache.spark.sql.types.DataType; import zingg.common.client.Arguments; -import zingg.common.core.Context; +import zingg.common.core.context.Context; import zingg.common.core.recommender.StopWordsRecommender; import org.apache.spark.sql.SparkSession; From a0659689022c7e2f2613da0a802d8f149252113c Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 6 Sep 2023 22:10:38 +0530 Subject: [PATCH 015/219] more im changes --- .../core/src/main/java/zingg/common/core/executor/Matcher.java | 2 +- .../core/src/main/java/zingg/common/core/util/GraphUtil.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 776608597..516b5960e 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -231,7 +231,7 @@ public ZFrame getObvDupePairs(ZFrame blocked) throws ZinggClientEx return onlyIds; } - public ZFrame 
getGraph(ZFrame blocked, ZFrame dupesActual){ + public ZFrame getGraph(ZFrame blocked, ZFrame dupesActual) throws ZinggClientException{ return getGraphUtil().buildGraph(blocked, dupesActual).cache(); } diff --git a/common/core/src/main/java/zingg/common/core/util/GraphUtil.java b/common/core/src/main/java/zingg/common/core/util/GraphUtil.java index d91b59bbd..69d72db30 100644 --- a/common/core/src/main/java/zingg/common/core/util/GraphUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/GraphUtil.java @@ -1,10 +1,11 @@ package zingg.common.core.util; import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; public interface GraphUtil { - public ZFrame buildGraph(ZFrame vertices, ZFrame edges) ; + public ZFrame buildGraph(ZFrame vertices, ZFrame edges) throws ZinggClientException ; /* From 0bb2e969c698f70fee9b7074369425a92050a5f5 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Fri, 8 Sep 2023 14:16:18 +0530 Subject: [PATCH 016/219] options change --- .../main/java/zingg/common/client/Client.java | 2 + .../zingg/common/client/ClientOptions.java | 1 + .../zingg/common/client/IZinggFactory.java | 4 +- .../zingg/common/client/ZinggOptions.java | 63 ------------------- .../common/client/options/ZinggOption.java | 15 +++++ .../common/client/options/ZinggOptions.java | 63 +++++++++++++++++++ .../java/zingg/common/client/TestClient.java | 2 + .../common/core/executor/Documenter.java | 4 +- .../common/core/executor/FindAndLabeller.java | 4 +- .../core/executor/LabelDataViewHelper.java | 3 +- .../common/core/executor/LabelUpdater.java | 4 +- .../zingg/common/core/executor/Labeller.java | 4 +- .../zingg/common/core/executor/Linker.java | 4 +- .../zingg/common/core/executor/Matcher.java | 4 +- .../common/core/executor/Recommender.java | 4 +- .../common/core/executor/TrainMatcher.java | 4 +- .../core/executor/TrainingDataFinder.java | 4 +- .../core/executor/TrainingDataModel.java | 3 +- .../zingg/common/core/executor/ZinggBase.java | 11 ++-- .../spark/core/executor/SparkDocumenter.java | 4 +- .../core/executor/SparkFindAndLabeller.java | 4 +- .../core/executor/SparkLabelUpdater.java | 4 +- .../spark/core/executor/SparkLabeller.java | 4 +- .../spark/core/executor/SparkLinker.java | 4 +- .../spark/core/executor/SparkMatcher.java | 4 +- .../spark/core/executor/SparkPeekModel.java | 4 +- .../spark/core/executor/SparkRecommender.java | 4 +- .../core/executor/SparkTrainMatcher.java | 4 +- .../spark/core/executor/SparkTrainer.java | 4 +- .../executor/SparkTrainingDataFinder.java | 4 +- .../spark/core/executor/SparkZFactory.java | 7 ++- 31 files changed, 137 insertions(+), 117 deletions(-) delete mode 100644 common/client/src/main/java/zingg/common/client/ZinggOptions.java create mode 100644 common/client/src/main/java/zingg/common/client/options/ZinggOption.java create mode 100644 common/client/src/main/java/zingg/common/client/options/ZinggOptions.java diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index ba1287d72..69e605803 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -5,6 +5,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.Email; import zingg.common.client.util.EmailBody; @@ -91,6 +92,7 @@ public void setZingg(Arguments args, ClientOptions options) throws Exception{
setZingg(zf.get(ZinggOptions.getByValue(ZinggOptions.PEEK_MODEL.getName()))); } } + public void setZingg(IZingg zingg) { this.zingg = zingg; diff --git a/common/client/src/main/java/zingg/common/client/ClientOptions.java b/common/client/src/main/java/zingg/common/client/ClientOptions.java index 8fb8dbf8f..e35d4a541 100644 --- a/common/client/src/main/java/zingg/common/client/ClientOptions.java +++ b/common/client/src/main/java/zingg/common/client/ClientOptions.java @@ -12,6 +12,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.Util; public class ClientOptions { diff --git a/common/client/src/main/java/zingg/common/client/IZinggFactory.java b/common/client/src/main/java/zingg/common/client/IZinggFactory.java index 427cbf35d..02a4b8d9c 100644 --- a/common/client/src/main/java/zingg/common/client/IZinggFactory.java +++ b/common/client/src/main/java/zingg/common/client/IZinggFactory.java @@ -1,9 +1,9 @@ package zingg.common.client; -import zingg.common.client.IZingg; +import zingg.common.client.options.ZinggOption; public interface IZinggFactory { - public IZingg get(ZinggOptions z) throws InstantiationException, IllegalAccessException, ClassNotFoundException; + public IZingg get(ZinggOption z) throws InstantiationException, IllegalAccessException, ClassNotFoundException; } diff --git a/common/client/src/main/java/zingg/common/client/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/ZinggOptions.java deleted file mode 100644 index 8a3ae43a7..000000000 --- a/common/client/src/main/java/zingg/common/client/ZinggOptions.java +++ /dev/null @@ -1,63 +0,0 @@ -package zingg.common.client; - -import java.util.HashMap; -import java.util.Map; - -import zingg.common.client.util.Util; - -public class ZinggOptions { - - public final static ZinggOptions TRAIN = new ZinggOptions("train"); - public final static ZinggOptions MATCH = new ZinggOptions("match"); - public final static ZinggOptions TRAIN_MATCH = new ZinggOptions("trainMatch"); - public final static ZinggOptions FIND_TRAINING_DATA = new ZinggOptions("findTrainingData"); - public final static ZinggOptions LABEL = new ZinggOptions("label"); - public final static ZinggOptions LINK = new ZinggOptions("link"); - public final static ZinggOptions GENERATE_DOCS = new ZinggOptions("generateDocs"); - public final static ZinggOptions RECOMMEND = new ZinggOptions("recommend"); - public final static ZinggOptions UPDATE_LABEL = new ZinggOptions("updateLabel"); - public final static ZinggOptions FIND_AND_LABEL = new ZinggOptions("findAndLabel"); - public final static ZinggOptions ASSESS_MODEL = new ZinggOptions("assessModel"); - public final static ZinggOptions PEEK_MODEL = new ZinggOptions("peekModel"); - public final static ZinggOptions EXPORT_MODEL = new ZinggOptions("exportModel"); - - public static Map allZinggOptions = new HashMap(); - - String name; - - public ZinggOptions(String name) { - this.name = name; - allZinggOptions.put(name, this); - } - - - - public static String[] getAllZinggOptions() { - ZinggOptions[] zo = allZinggOptions.values().toArray(new ZinggOptions[allZinggOptions.size()]); - int i = 0; - String[] s = new String[zo.length]; - for (ZinggOptions z: zo) { - s[i++] = z.getName(); - } - return s; - } - - public String getName() { - return name; - } - - public static final ZinggOptions getByValue(String value){ - for (ZinggOptions zo: ZinggOptions.allZinggOptions.values()) { - if (zo.name.equals(value)) 
return zo; - } - return null; - } - - public static void verifyPhase(String phase) throws ZinggClientException { - if (getByValue(phase) == null) { - String message = "'" + phase + "' is not a valid phase. " - + "Valid phases are: " + Util.join(getAllZinggOptions(), "|"); - throw new ZinggClientException(message); - } - } -} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOption.java b/common/client/src/main/java/zingg/common/client/options/ZinggOption.java new file mode 100644 index 000000000..d15a4518f --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOption.java @@ -0,0 +1,15 @@ +package zingg.common.client.options; + +public class ZinggOption { + String name; + + public ZinggOption(String name) { + this.name = name; + ZinggOptions.put(this); + } + + public String getName() { + return name; + } + +} diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java new file mode 100644 index 000000000..821d867e7 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java @@ -0,0 +1,63 @@ +package zingg.common.client.options; + +import java.util.HashMap; +import java.util.Map; + +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.Util; + +public class ZinggOptions { + + public final static ZinggOption TRAIN = new ZinggOption("train"); + public final static ZinggOption MATCH = new ZinggOption("match"); + public final static ZinggOption TRAIN_MATCH = new ZinggOption("trainMatch"); + public final static ZinggOption FIND_TRAINING_DATA = new ZinggOption("findTrainingData"); + public final static ZinggOption LABEL = new ZinggOption("label"); + public final static ZinggOption LINK = new ZinggOption("link"); + public final static ZinggOption GENERATE_DOCS = new ZinggOption("generateDocs"); + public final static ZinggOption RECOMMEND = new ZinggOption("recommend"); + public final static ZinggOption UPDATE_LABEL = new ZinggOption("updateLabel"); + public final static ZinggOption FIND_AND_LABEL = new ZinggOption("findAndLabel"); + public final static ZinggOption ASSESS_MODEL = new ZinggOption("assessModel"); + public final static ZinggOption PEEK_MODEL = new ZinggOption("peekModel"); + public final static ZinggOption EXPORT_MODEL = new ZinggOption("exportModel"); + + public static final Map allZinggOptions = new HashMap(); + + + + private ZinggOptions() { + } + + public static final void put(ZinggOption o) { + allZinggOptions.put(o.getName(), o); + } + + + + public static String[] getAllZinggOptions() { + ZinggOption[] zo = allZinggOptions.values().toArray(new ZinggOption[allZinggOptions.size()]); + int i = 0; + String[] s = new String[zo.length]; + for (ZinggOption z: zo) { + s[i++] = z.getName(); + } + return s; + } + + + public static final ZinggOption getByValue(String value){ + for (ZinggOption zo: ZinggOptions.allZinggOptions.values()) { + if (zo.name.equals(value)) return zo; + } + return null; + } + + public static void verifyPhase(String phase) throws ZinggClientException { + if (getByValue(phase) == null) { + String message = "'" + phase + "' is not a valid phase. 
" + + "Valid phases are: " + Util.join(getAllZinggOptions(), "|"); + throw new ZinggClientException(message); + } + } +} \ No newline at end of file diff --git a/common/client/src/test/java/zingg/common/client/TestClient.java b/common/client/src/test/java/zingg/common/client/TestClient.java index e22ff3c21..5a3befd85 100644 --- a/common/client/src/test/java/zingg/common/client/TestClient.java +++ b/common/client/src/test/java/zingg/common/client/TestClient.java @@ -6,6 +6,8 @@ import org.apache.commons.logging.LogFactory; import org.junit.jupiter.api.Test; +import zingg.common.client.options.ZinggOptions; + public class TestClient { public static final Log LOG = LogFactory.getLog(TestClient.class); diff --git a/common/core/src/main/java/zingg/common/core/executor/Documenter.java b/common/core/src/main/java/zingg/common/core/executor/Documenter.java index 6e80b8aa7..2841720e5 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Documenter.java +++ b/common/core/src/main/java/zingg/common/core/executor/Documenter.java @@ -4,7 +4,7 @@ import org.apache.commons.logging.LogFactory; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.documenter.DataDocumenter; import zingg.common.core.documenter.ModelDocumenter; @@ -14,7 +14,7 @@ public abstract class Documenter extends ZinggBase { public static final Log LOG = LogFactory.getLog(Documenter.class); public Documenter() { - setZinggOptions(ZinggOptions.GENERATE_DOCS); + setZinggOption(ZinggOptions.GENERATE_DOCS); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index 899956d57..f010468d2 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -5,7 +5,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; public abstract class FindAndLabeller extends ZinggBase { private static final long serialVersionUID = 1L; @@ -16,7 +16,7 @@ public abstract class FindAndLabeller extends ZinggBase labeller; public FindAndLabeller() { - setZinggOptions(ZinggOptions.FIND_AND_LABEL); + setZinggOption(ZinggOptions.FIND_AND_LABEL); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index c63f7dd6a..02e1a8a1a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -10,7 +10,7 @@ import zingg.common.client.ILabelDataViewHelper; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import zingg.common.core.context.Context; @@ -23,7 +23,6 @@ public class LabelDataViewHelper extends ZinggBase imp public LabelDataViewHelper(Context context, ClientOptions clientOptions) { setContext(context); - setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } 
diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java index 0615819d4..3d8878672 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java @@ -8,7 +8,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; import zingg.common.core.util.LabelMatchType; @@ -19,7 +19,7 @@ public abstract class LabelUpdater extends Labeller { public static final Log LOG = LogFactory.getLog(LabelUpdater.class); public LabelUpdater() { - setZinggOptions(ZinggOptions.UPDATE_LABEL); + setZinggOption(ZinggOptions.UPDATE_LABEL); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Labeller.java b/common/core/src/main/java/zingg/common/core/executor/Labeller.java index cc8cda8c4..f58020a11 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Labeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/Labeller.java @@ -10,7 +10,7 @@ import zingg.common.client.ITrainingDataModel; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; public abstract class Labeller extends ZinggBase { @@ -24,7 +24,7 @@ public abstract class Labeller extends ZinggBase { protected ILabelDataViewHelper labelDataViewHelper; public Labeller() { - setZinggOptions(ZinggOptions.LABEL); + setZinggOption(ZinggOptions.LABEL); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 7e7f402ca..ed8c6fad1 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -5,7 +5,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; @@ -17,7 +17,7 @@ public abstract class Linker extends Matcher { public static final Log LOG = LogFactory.getLog(Linker.class); public Linker() { - setZinggOptions(ZinggOptions.LINK); + setZinggOption(ZinggOptions.LINK); } public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 516b5960e..e4add807a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -8,7 +8,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import zingg.common.core.block.Canopy; @@ -25,7 +25,7 @@ public abstract class Matcher extends ZinggBase{ public static final Log LOG = 
LogFactory.getLog(Matcher.class); public Matcher() { - setZinggOptions(ZinggOptions.MATCH); + setZinggOption(ZinggOptions.MATCH); } public ZFrame getTestData() throws ZinggClientException{ diff --git a/common/core/src/main/java/zingg/common/core/executor/Recommender.java b/common/core/src/main/java/zingg/common/core/executor/Recommender.java index 7119a1182..cc870c41a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Recommender.java +++ b/common/core/src/main/java/zingg/common/core/executor/Recommender.java @@ -4,7 +4,7 @@ import org.apache.commons.logging.LogFactory; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.recommender.StopWordsRecommender; public abstract class Recommender extends ZinggBase { @@ -13,7 +13,7 @@ public abstract class Recommender extends ZinggBase { public static final Log LOG = LogFactory.getLog(Recommender.class); public Recommender() { - setZinggOptions(ZinggOptions.RECOMMEND); + setZinggOption(ZinggOptions.RECOMMEND); } public void execute() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index cf65b5ffc..6d7d4bdf7 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -4,7 +4,7 @@ import org.apache.commons.logging.LogFactory; import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; public abstract class TrainMatcher extends ZinggBase{ @@ -16,7 +16,7 @@ public abstract class TrainMatcher extends ZinggBase{ protected Matcher matcher; public TrainMatcher() { - setZinggOptions(ZinggOptions.TRAIN_MATCH); + setZinggOption(ZinggOptions.TRAIN_MATCH); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index 0f93c723d..cab1139a7 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -5,7 +5,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; @@ -20,7 +20,7 @@ public abstract class TrainingDataFinder extends ZinggBase public static final Log LOG = LogFactory.getLog(TrainingDataFinder.class); public TrainingDataFinder() { - setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + setZinggOption(ZinggOptions.FIND_TRAINING_DATA); } public ZFrame getTraining() throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index b50bbbe8f..b57c4916b 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -8,7 +8,7 @@ import zingg.common.client.ITrainingDataModel; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; 
-import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; @@ -23,7 +23,6 @@ public class TrainingDataModel extends ZinggBase imple public TrainingDataModel(Context context, ClientOptions clientOptions) { setContext(context); - setZinggOptions(zinggOptions); setClientOptions(clientOptions); setName(this.getClass().getName()); } diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index 49b0a4d0a..674eb6106 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -13,7 +13,8 @@ import zingg.common.client.MatchType; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOption; +import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import zingg.common.core.context.Context; @@ -33,7 +34,7 @@ public abstract class ZinggBase implements Serializable, IZingg context; protected String name; - protected ZinggOptions zinggOptions; + protected ZinggOption zinggOption; protected long startTime; protected ClientOptions clientOptions; @@ -85,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); - Analytics.postEvent(zinggOptions.getName(), collectMetrics); + Analytics.postEvent(zinggOption.getName(), collectMetrics); } public Arguments getArgs() { @@ -110,8 +111,8 @@ public void setName(String name) { this.name = name; } - public void setZinggOptions(ZinggOptions zinggOptions) { - this.zinggOptions = zinggOptions; + public void setZinggOption(ZinggOption zinggOptions) { + this.zinggOption = zinggOptions; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index b4d68f0e5..7e6cab68f 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.documenter.DataDocumenter; import zingg.common.core.documenter.ModelDocumenter; @@ -27,7 +27,7 @@ public class SparkDocumenter extends Documenter, Row, public static final Log LOG = LogFactory.getLog(SparkDocumenter.class); public SparkDocumenter() { - setZinggOptions(ZinggOptions.GENERATE_DOCS); + setZinggOption(ZinggOptions.GENERATE_DOCS); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index 06855915b..9e3f6e976 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import 
zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.FindAndLabeller; import zingg.spark.core.context.ZinggSparkContext; @@ -24,7 +24,7 @@ public class SparkFindAndLabeller extends FindAndLabeller, public static final Log LOG = LogFactory.getLog(SparkLabelUpdater.class); public SparkLabelUpdater() { - setZinggOptions(ZinggOptions.UPDATE_LABEL); + setZinggOption(ZinggOptions.UPDATE_LABEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index 87cf07fc1..d2aa8540d 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Labeller; @@ -32,7 +32,7 @@ public SparkLabeller() { } public SparkLabeller(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.LABEL); + setZinggOption(ZinggOptions.LABEL); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 500548d7c..b027915e0 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.Linker; import zingg.common.core.model.Model; @@ -26,7 +26,7 @@ public class SparkLinker extends Linker, Row, Column, public static final Log LOG = LogFactory.getLog(SparkLinker.class); public SparkLinker() { - setZinggOptions(ZinggOptions.LINK); + setZinggOption(ZinggOptions.LINK); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index a05f86f27..a3aba2c31 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -10,7 +10,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Matcher; import zingg.common.core.model.Model; @@ -35,7 +35,7 @@ public SparkMatcher() { } public SparkMatcher(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.MATCH); + setZinggOption(ZinggOptions.MATCH); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index d23a6bb43..230a30016 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -15,7 +15,7 @@ import zingg.common.client.Arguments; import 
zingg.common.client.ClientOptions; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.ZinggBase; import zingg.spark.core.context.ZinggSparkContext; @@ -28,7 +28,7 @@ public class SparkPeekModel extends ZinggBase, Row, C public static final Log LOG = LogFactory.getLog(SparkPeekModel.class); public SparkPeekModel() { - setZinggOptions(ZinggOptions.PEEK_MODEL); + setZinggOption(ZinggOptions.PEEK_MODEL); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 480f4ec4f..3f37eb0c3 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.Recommender; import zingg.common.core.recommender.StopWordsRecommender; @@ -29,7 +29,7 @@ public class SparkRecommender extends Recommender, Ro public static final Log LOG = LogFactory.getLog(SparkRecommender.class); public SparkRecommender() { - setZinggOptions(ZinggOptions.RECOMMEND); + setZinggOption(ZinggOptions.RECOMMEND); setContext(new ZinggSparkContext()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index adc05458c..4f724bc15 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.TrainMatcher; import zingg.spark.core.context.ZinggSparkContext; @@ -22,7 +22,7 @@ public class SparkTrainMatcher extends TrainMatcher, public static final Log LOG = LogFactory.getLog(SparkTrainMatcher.class); public SparkTrainMatcher() { - setZinggOptions(ZinggOptions.TRAIN_MATCH); + setZinggOption(ZinggOptions.TRAIN_MATCH); ZinggSparkContext sparkContext = new ZinggSparkContext(); setContext(sparkContext); trainer = new SparkTrainer(sparkContext); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index e6d0af4cb..4ddb00ddb 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -11,7 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.Trainer; import zingg.common.core.preprocess.StopWordsRemover; @@ -30,7 +30,7 @@ public SparkTrainer() { } public SparkTrainer(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.TRAIN); + setZinggOption(ZinggOptions.TRAIN); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java 
b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index 4e7c9ecb8..a61cad5f1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -9,7 +9,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOptions; import zingg.spark.core.context.ZinggSparkContext; import zingg.common.core.executor.TrainingDataFinder; import zingg.common.core.preprocess.StopWordsRemover; @@ -27,7 +27,7 @@ public SparkTrainingDataFinder() { } public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { - setZinggOptions(ZinggOptions.FIND_TRAINING_DATA); + setZinggOption(ZinggOptions.FIND_TRAINING_DATA); setContext(sparkContext); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java index a64570f45..5e9079796 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java @@ -4,7 +4,8 @@ import zingg.common.client.IZingg; import zingg.common.client.IZinggFactory; -import zingg.common.client.ZinggOptions; +import zingg.common.client.options.ZinggOption; +import zingg.common.client.options.ZinggOptions; import zingg.spark.core.executor.SparkDocumenter; import zingg.spark.core.executor.SparkFindAndLabeller; import zingg.spark.core.executor.SparkLabelUpdater; @@ -20,7 +21,7 @@ public class SparkZFactory implements IZinggFactory{ public SparkZFactory() {} - public static HashMap zinggers = new HashMap(); + public static HashMap zinggers = new HashMap(); static { zinggers.put(ZinggOptions.TRAIN, SparkTrainer.name); @@ -36,7 +37,7 @@ public SparkZFactory() {} zinggers.put(ZinggOptions.PEEK_MODEL, SparkPeekModel.name); } - public IZingg get(ZinggOptions z) throws InstantiationException, IllegalAccessException, ClassNotFoundException { + public IZingg get(ZinggOption z) throws InstantiationException, IllegalAccessException, ClassNotFoundException { return (IZingg) Class.forName(zinggers.get(z)).newInstance(); } From 9033c033388515e6489c64949f58d70a5347dbbb Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Fri, 8 Sep 2023 21:55:22 +0530 Subject: [PATCH 017/219] ftd triggered --- .../main/java/zingg/common/client/options/ZinggOption.java | 4 ++++ .../main/java/zingg/common/client/options/ZinggOptions.java | 5 ++++- .../java/zingg/common/core/executor/TrainingDataFinder.java | 4 +--- .../main/java/zingg/spark/core/executor/SparkDocumenter.java | 2 +- .../java/zingg/spark/core/executor/SparkFindAndLabeller.java | 2 +- .../java/zingg/spark/core/executor/SparkLabelUpdater.java | 2 +- .../main/java/zingg/spark/core/executor/SparkLabeller.java | 2 +- .../src/main/java/zingg/spark/core/executor/SparkLinker.java | 2 +- .../main/java/zingg/spark/core/executor/SparkPeekModel.java | 2 +- .../java/zingg/spark/core/executor/SparkRecommender.java | 2 +- .../java/zingg/spark/core/executor/SparkTrainMatcher.java | 2 +- .../main/java/zingg/spark/core/executor/SparkTrainer.java | 2 +- .../zingg/spark/core/executor/SparkTrainingDataFinder.java | 4 ++-- 13 files changed, 20 insertions(+), 15 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOption.java b/common/client/src/main/java/zingg/common/client/options/ZinggOption.java index 
d15a4518f..2b3ba2999 100644 --- a/common/client/src/main/java/zingg/common/client/options/ZinggOption.java +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOption.java @@ -12,4 +12,8 @@ public String getName() { return name; } + @Override + public String toString(){ + return name; + } } diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java index 821d867e7..b5daec200 100644 --- a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java @@ -22,7 +22,7 @@ public class ZinggOptions { public final static ZinggOption PEEK_MODEL = new ZinggOption("peekModel"); public final static ZinggOption EXPORT_MODEL = new ZinggOption("exportModel"); - public static final Map allZinggOptions = new HashMap(); + public static Map allZinggOptions;// = new HashMap(); @@ -30,6 +30,9 @@ private ZinggOptions() { } public static final void put(ZinggOption o) { + if (allZinggOptions == null) { + allZinggOptions = new HashMap(); + } allZinggOptions.put(o.getName(), o); } diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index cab1139a7..2c09d31f3 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -79,17 +79,15 @@ public void execute() throws ZinggClientException { ZFrame sample = getStopWords().preprocessForStopWords(sampleOrginal); Tree> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList()); - tree.print(2); + //tree.print(2); ZFrame blocked = getBlockingTreeUtil().getBlockHashes(sample, tree); blocked = blocked.repartition(args.getNumPartitions(), blocked.col(ColName.HASH_COL)).cache(); - System.out.println("blocked"); if (LOG.isDebugEnabled()) { blocked.show(true); } ZFrame blocks = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true); blocks = blocks.cache(); - System.out.println("blocks"); if (LOG.isDebugEnabled()) { blocks.show(); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 7e6cab68f..bcb3b7753 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -34,7 +34,7 @@ public SparkDocumenter() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index 9e3f6e976..2f9a6b0b2 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -34,7 +34,7 @@ public SparkFindAndLabeller() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java 
b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index 6357627c3..70092a74f 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -38,7 +38,7 @@ public SparkLabelUpdater() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } protected Pipe setSaveModeOnPipe(Pipe p) { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index d2aa8540d..f23d7cec5 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -39,7 +39,7 @@ public SparkLabeller(ZinggSparkContext sparkContext) { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index b027915e0..524db6ae2 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -33,7 +33,7 @@ public SparkLinker() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index 230a30016..092d27baf 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -40,7 +40,7 @@ public void init(Arguments args) getContext().setUtils(); //we will not init here as we want py to drive //the spark session etc - //getContext().init(license); + getContext().init(); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 3f37eb0c3..8f22a0bcf 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -36,7 +36,7 @@ public SparkRecommender() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 4f724bc15..bc3d26de1 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -32,7 +32,7 @@ public SparkTrainMatcher() { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 4ddb00ddb..86c57c842 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++
b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -37,7 +37,7 @@ public SparkTrainer(ZinggSparkContext sparkContext) { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index a61cad5f1..49465989b 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -27,14 +27,14 @@ public SparkTrainingDataFinder() { } public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { - setZinggOption(ZinggOptions.FIND_TRAINING_DATA); + super(); setContext(sparkContext); } @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - //getContext().init(license); + getContext().init(); } @Override From 8947c24044539bb5d579522a0eedc1b32908d6ca Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 9 Sep 2023 20:22:09 +0530 Subject: [PATCH 018/219] zinggoptions working --- .../src/main/java/zingg/common/client/options/ZinggOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java index b5daec200..d4c98ed1e 100644 --- a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java @@ -26,7 +26,7 @@ public class ZinggOptions { - private ZinggOptions() { + protected ZinggOptions() { } public static final void put(ZinggOption o) { From 342571995320779abe988816eb56693e09af7ad4 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 9 Sep 2023 20:26:22 +0530 Subject: [PATCH 019/219] matcher works --- log4j.properties | 2 +- .../src/main/java/zingg/spark/core/executor/SparkMatcher.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/log4j.properties b/log4j.properties index 4e3f869d8..acb80463c 100644 --- a/log4j.properties +++ b/log4j.properties @@ -21,7 +21,7 @@ log4j.logger.org.apache.hadoop=WARN log4j.logger.org.apache.spark=WARN, FILE log4j.logger.org.apache.spark.ml=WARN, FILE log4j.logger.org.apache.parquet.hadoop=WARN, FILE -log4j.logger.org.graphframes=INFO, FILE +log4j.logger.org.graphframes=WARN, FILE log4j.logger.org.spark_project=OFF log4j.logger.org.sparkproject=OFF log4j.logger.org.elasticsearch.hadoop=INFO diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index a3aba2c31..a5b09d9ca 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -42,7 +42,7 @@ public SparkMatcher(ZinggSparkContext sparkContext) { @Override public void init(Arguments args) throws ZinggClientException { super.init(args); - // getContext().init(license); + getContext().init(); } From 566c11d53cd65ffe36e63a5f5e0cd719e1bf210a Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 18 Nov 2023 09:06:29 +0530 Subject: [PATCH 020/219] removed diamond operator without classes --- .../src/main/java/zingg/common/client/util/CliUtils.java | 6 +++--- 
.../java/zingg/spark/core/util/SparkBlockingTreeUtil.java | 2 +- spark/core/src/test/java/zingg/block/TestBlock.java | 2 +- .../java/zingg/common/core/preprocess/TestStopWords.java | 2 +- .../common/core/recommender/TestStopWordsRecommender.java | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/util/CliUtils.java b/common/client/src/main/java/zingg/common/client/util/CliUtils.java index 0296cc1cd..19c01eed0 100644 --- a/common/client/src/main/java/zingg/common/client/util/CliUtils.java +++ b/common/client/src/main/java/zingg/common/client/util/CliUtils.java @@ -49,7 +49,7 @@ public static void formatIntoTable(List table) { } private static List prepareColumnTexts(List table, int maxColumnWidth) { - List finalTableList = new ArrayList<>(); + List finalTableList = new ArrayList(); for (String[] row : table) { // If any cell data is more than max width, then it will need extra row. @@ -129,7 +129,7 @@ private static Map calculateColumnsLength(List table * * Map columnLengths is */ - Map columnLengths = new HashMap<>(); + Map columnLengths = new HashMap(); for (String[] row: table) { int i = 0; for (String col : row) { @@ -187,7 +187,7 @@ public static void testFunc() { /* * Create new table array with wrapped rows */ - ArrayList tableList = new ArrayList<>(Arrays.asList(table)); // Input + ArrayList tableList = new ArrayList(Arrays.asList(table)); // Input formatIntoTable(tableList); } } \ No newline at end of file diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java index 2cc13a4d9..d330c2c04 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java @@ -60,7 +60,7 @@ public StructType appendHashCol(StructType s) { @Override public ZFrame, Row, Column> getTreeDF(byte[] blockingTree){ StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("BlockingTree", DataTypes.BinaryType, false) }); - List objList = new ArrayList<>(); + List objList = new ArrayList(); objList.add(RowFactory.create(blockingTree)); Dataset df = spark.sqlContext().createDataFrame(objList, schema).toDF().coalesce(1); return new SparkFrame(df); diff --git a/spark/core/src/test/java/zingg/block/TestBlock.java b/spark/core/src/test/java/zingg/block/TestBlock.java index 504b694ac..89276b548 100644 --- a/spark/core/src/test/java/zingg/block/TestBlock.java +++ b/spark/core/src/test/java/zingg/block/TestBlock.java @@ -85,7 +85,7 @@ private Arguments getArguments() throws ZinggClientException { } private List getFieldDefList() { - List fdList = new ArrayList<>(4); + List fdList = new ArrayList(4); FieldDefinition idFD = new FieldDefinition(); idFD.setDataType("integer"); diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java index 77d872e88..7df457474 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java @@ -57,7 +57,7 @@ public void testStopWordsSingleColumn() throws ZinggClientException { RowFactory.create("best luck to zingg")), schema); - List fdList = new ArrayList<>(4); + List fdList = new ArrayList(4); ArrayList matchTypelistFuzzy = new ArrayList(); matchTypelistFuzzy.add(MatchType.FUZZY); diff 
--git a/spark/core/src/test/java/zingg/common/core/recommender/TestStopWordsRecommender.java b/spark/core/src/test/java/zingg/common/core/recommender/TestStopWordsRecommender.java index b64fd6d86..7578e4add 100644 --- a/spark/core/src/test/java/zingg/common/core/recommender/TestStopWordsRecommender.java +++ b/spark/core/src/test/java/zingg/common/core/recommender/TestStopWordsRecommender.java @@ -30,7 +30,7 @@ public class TestStopWordsRecommender extends ZinggSparkTester { StopWordsRecommender recommender = new SparkStopWordsRecommender(zsCTX, args); Dataset dataset = createDFWithGivenStopWords(); - List stopwordRow= new ArrayList<>(); + List stopwordRow= new ArrayList(); List stopwordList = new ArrayList(); Dataset stopWords; From eb4fea823262cd06e0363d8db4bc526f80692dd3 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 28 Nov 2023 18:32:02 +0530 Subject: [PATCH 021/219] added business exception class, issue #730 --- .../common/client/ZinggBusinessException.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/ZinggBusinessException.java diff --git a/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java b/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java new file mode 100644 index 000000000..37d592c65 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java @@ -0,0 +1,21 @@ +package zingg.common.client; + +import java.io.Serializable; + +/** + * To be thrown in case of a business scenario which needs graceful handling + * + */ +public class ZinggBusinessException extends Exception implements Serializable { + + private static final long serialVersionUID = 1L; + + public ZinggBusinessException(String message) { + super(message); + } + + public ZinggBusinessException(String message, Throwable cause) { + super(message, cause); + } + +} From dc794dbaff2f9c1206b213c17bacbeeea05197f8 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 29 Nov 2023 10:46:15 +0530 Subject: [PATCH 022/219] removed redundant implementation of interface Serializable --- .../java/zingg/common/client/ZinggBusinessException.java | 4 +--- .../main/java/zingg/common/client/ZinggClientException.java | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java b/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java index 37d592c65..e33fd5683 100644 --- a/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java +++ b/common/client/src/main/java/zingg/common/client/ZinggBusinessException.java @@ -1,12 +1,10 @@ package zingg.common.client; -import java.io.Serializable; - /** * To be thrown in case of a business scenario which needs graceful handling * */ -public class ZinggBusinessException extends Exception implements Serializable { +public class ZinggBusinessException extends Exception { private static final long serialVersionUID = 1L; diff --git a/common/client/src/main/java/zingg/common/client/ZinggClientException.java b/common/client/src/main/java/zingg/common/client/ZinggClientException.java index 6791b6d42..6535cda76 100644 --- a/common/client/src/main/java/zingg/common/client/ZinggClientException.java +++ b/common/client/src/main/java/zingg/common/client/ZinggClientException.java @@ -1,7 +1,5 @@ package zingg.common.client; -import java.io.Serializable; - /** * Base class for all Zingg Exceptions * * @author sgoyal * */ -public class
ZinggClientException extends Throwable implements Serializable { +public class ZinggClientException extends Throwable { + private static final long serialVersionUID = 1L; + public String message; public ZinggClientException(String m) { From 10a5255365f48ea3012e3e2f2b9755eaf8297596 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 4 Dec 2023 17:39:32 +0530 Subject: [PATCH 023/219] refactor get data into a separate method --- .../java/zingg/common/core/executor/TrainingDataFinder.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index 30181dcc5..289a7ae50 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -29,9 +29,13 @@ public ZFrame getTraining() throws ZinggClientException { return getDSUtil().getTraining(getPipeUtil(), args); } + protected ZFrame getData() throws ZinggClientException { + return getPipeUtil().read(true, true, args.getData()); + } + public void execute() throws ZinggClientException { try{ - ZFrame data = getPipeUtil().read(true, true, args.getData()); + ZFrame data = getData(); LOG.warn("Read input data " + data.count()); LOG.debug("input data schema is " +data.showSchema()); //create 20 pos pairs From a83a2c33042418bd291af2a54490388fa5ca9612 Mon Sep 17 00:00:00 2001 From: Sonal Date: Tue, 5 Dec 2023 22:22:33 +0530 Subject: [PATCH 024/219] Update codeql.yml --- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 50a5cf2af..63477c448 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -13,10 +13,10 @@ name: "CodeQL" on: push: - branches: [ main ] + branches: [ main, 0.4.0 ] pull_request: # The branches below must be a subset of the branches above - branches: [ main ] + branches: [ main, 0.4.0 ] schedule: - cron: '22 3 * * 5' From 13da7f9e988118353a1526d39ac17ac468f4709b Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 11 Dec 2023 18:29:47 +0530 Subject: [PATCH 025/219] new phase approveClusters --- .../client/src/main/java/zingg/common/client/ZinggOptions.java | 1 + 1 file changed, 1 insertion(+) diff --git a/common/client/src/main/java/zingg/common/client/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/ZinggOptions.java index 48bb49613..8c3d32173 100644 --- a/common/client/src/main/java/zingg/common/client/ZinggOptions.java +++ b/common/client/src/main/java/zingg/common/client/ZinggOptions.java @@ -17,6 +17,7 @@ public enum ZinggOptions { ASSESS_MODEL("assessModel"), PEEK_MODEL("peekModel"), EXPORT_MODEL("exportModel"), + APPROVE_CLUSTERS("approveClusters"), RUN_INCREMENTAL("runIncremental"); private String value; From 08c3ab4ddbde348aacdc6d2f89f0c9c6e7fa0f57 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Dec 2023 13:51:08 +0530 Subject: [PATCH 026/219] use IArguments i.e.
driven by interface --- .../java/zingg/common/client/Arguments.java | 44 +++- .../zingg/common/client/ArgumentsUtil.java | 34 +-- .../main/java/zingg/common/client/Client.java | 18 +- .../java/zingg/common/client/IArguments.java | 233 ++++++++++++++++++ .../common/client/ILabelDataViewHelper.java | 2 +- .../common/client/ITrainingDataModel.java | 4 +- .../main/java/zingg/common/client/IZingg.java | 2 +- .../zingg/common/client/TestArguments.java | 16 +- .../core/documenter/DataColDocumenter.java | 4 +- .../core/documenter/DataDocumenter.java | 4 +- .../core/documenter/DocumenterBase.java | 4 +- .../core/documenter/ModelColDocumenter.java | 4 +- .../core/documenter/ModelDocumenter.java | 4 +- .../common/core/executor/FindAndLabeller.java | 4 +- .../core/executor/LabelDataViewHelper.java | 4 +- .../common/core/executor/TrainMatcher.java | 5 +- .../core/executor/TrainingDataModel.java | 8 +- .../zingg/common/core/executor/ZinggBase.java | 10 +- .../common/core/preprocess/StopWords.java | 4 +- .../core/preprocess/StopWordsRemover.java | 10 +- .../recommender/StopWordsRecommender.java | 6 +- .../common/core/util/BlockingTreeUtil.java | 10 +- .../java/zingg/common/core/util/DSUtil.java | 22 +- .../zingg/common/core/util/ModelUtil.java | 12 +- .../java/zingg/common/core/util/PipeUtil.java | 14 +- .../zingg/common/core/util/PipeUtilBase.java | 14 +- .../java/zingg/spark/client/SparkClient.java | 10 +- .../java/zingg/client/TestSparkFrameBase.java | 3 +- .../zingg/spark/client/TestArguments.java | 5 +- .../zingg/spark/client/TestSparkClient.java | 3 +- .../documenter/SparkDataColDocumenter.java | 4 +- .../core/documenter/SparkDataDocumenter.java | 4 +- .../documenter/SparkModelColDocumenter.java | 4 +- .../core/documenter/SparkModelDocumenter.java | 4 +- .../spark/core/executor/SparkDocumenter.java | 4 +- .../core/executor/SparkFindAndLabeller.java | 4 +- .../core/executor/SparkLabelUpdater.java | 4 +- .../spark/core/executor/SparkLabeller.java | 4 +- .../spark/core/executor/SparkLinker.java | 4 +- .../spark/core/executor/SparkMatcher.java | 4 +- .../spark/core/executor/SparkPeekModel.java | 4 +- .../spark/core/executor/SparkRecommender.java | 4 +- .../core/executor/SparkTrainMatcher.java | 4 +- .../spark/core/executor/SparkTrainer.java | 4 +- .../executor/SparkTrainingDataFinder.java | 4 +- .../preprocess/SparkStopWordsRemover.java | 4 +- .../SparkStopWordsRecommender.java | 4 +- .../zingg/spark/core/util/SparkModelUtil.java | 6 +- .../src/test/java/zingg/block/TestBlock.java | 8 +- .../core/documenter/TestDataDocumenter.java | 3 +- .../core/documenter/TestModelDocumenter.java | 3 +- .../common/core/preprocess/TestStopWords.java | 3 +- .../zingg/common/core/util/TestDSUtil.java | 5 +- .../zingg/common/core/util/TestPipeUtil.java | 3 +- .../spark/core/executor/ZinggSparkTester.java | 3 +- 55 files changed, 452 insertions(+), 167 deletions(-) create mode 100644 common/client/src/main/java/zingg/common/client/IArguments.java diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index b44f6f30b..a47323eef 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -79,7 +79,7 @@ * */ @JsonInclude(Include.NON_NULL) -public class Arguments implements Serializable { +public class Arguments implements Serializable, IArguments { private static final long serialVersionUID = 1L; // creates DriverArgs and invokes the main object @@ 
-103,6 +103,7 @@ public class Arguments implements Serializable { + @Override public void setThreshold(double threshold) { this.threshold = threshold; } @@ -115,10 +116,12 @@ public void setThreshold(double threshold) { public Arguments() { } + @Override public int getNumPartitions() { return numPartitions; } + @Override public void setNumPartitions(int numPartitions) throws ZinggClientException{ if (numPartitions != -1 && numPartitions <= 0) throw new ZinggClientException( @@ -134,6 +137,7 @@ public void setNumPartitions(int numPartitions) throws ZinggClientException{ * @return sample percent as a float between 0 and 1 */ + @Override public float getLabelDataSampleSize() { return labelDataSampleSize; } @@ -149,6 +153,7 @@ public float getLabelDataSampleSize() { * generating seed samples * @throws ZinggClientException */ + @Override public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClientException { if (labelDataSampleSize > 1 || labelDataSampleSize < 0) throw new ZinggClientException("Label Data Sample Size should be between 0 and 1"); @@ -160,6 +165,7 @@ public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClient * * @return list of field definitions */ + @Override public List getFieldDefinition() { return fieldDefinition; } @@ -173,6 +179,7 @@ public List getFieldDefinition() { * list of fields * @throws ZinggClientException */ + @Override public void setFieldDefinition(List fieldDefinition) throws ZinggClientException { /*if (fieldDefinition == null || fieldDefinition.size() ==0) throw new ZinggClientException("Missing or incorrect field definitions"); @@ -185,6 +192,7 @@ public void setFieldDefinition(List fieldDefinition) * * @return path to labeled positive sample file */ + @Override public Pipe[] getTrainingSamples() { return trainingSamples; } @@ -196,6 +204,7 @@ public Pipe[] getTrainingSamples() { * path of the matching (positive)labeled sample file * @throws ZinggClientException */ + @Override @JsonSetter public void setTrainingSamples(Pipe[] trainingSamples) throws ZinggClientException { //checkNullBlankEmpty(positiveTrainingSamples, "positive training samples"); @@ -226,10 +235,12 @@ public void setZinggInternal(Pipe[] zinggDir) { + @Override public String getModelId() { return modelId; } + @Override public void setModelId(String modelId) { this.modelId = modelId; } @@ -239,6 +250,7 @@ public void setModelId(String modelId) { * * @return output directory path of the result */ + @Override public Pipe[] getOutput() { return output; } @@ -250,6 +262,7 @@ public Pipe[] getOutput() { * where the match result is saved * @throws ZinggClientException */ + @Override public void setOutput(Pipe[] outputDir) throws ZinggClientException { //checkNullBlankEmpty(outputDir, " path for saving results"); this.output = outputDir; @@ -260,6 +273,7 @@ public void setOutput(Pipe[] outputDir) throws ZinggClientException { * * @return path of data file to be matched */ + @Override public Pipe[] getData() { return this.data; } @@ -272,6 +286,7 @@ public Pipe[] getData() { * /home/zingg/path/to/my/file/to/be/matched.csv * @throws ZinggClientException */ + @Override public void setData(Pipe[] dataFile) throws ZinggClientException { checkNullBlankEmpty(dataFile, "file to be matched"); this.data = dataFile; @@ -312,6 +327,7 @@ public String toString() { * * @return the path for internal Zingg usage */ + @Override public String getZinggDir() { return zinggDir; } @@ -323,6 +339,7 @@ public String getZinggDir() { * @param zinggDir * path to the Zingg directory */ 
+ @Override public void setZinggDir(String zinggDir) { this.zinggDir = zinggDir; } @@ -334,25 +351,30 @@ public void setZinggDir(String zinggDir) { * @return the path for internal Zingg usage */ + @Override @JsonIgnore public String getZinggBaseModelDir(){ return zinggDir + "/" + modelId; } + @Override @JsonIgnore public String getZinggModelDir() { return getZinggBaseModelDir() + "/model"; } + @Override @JsonIgnore public String getZinggDocDir() { return getZinggBaseModelDir() + "/docs/"; } + @Override @JsonIgnore public String getZinggModelDocFile() { return getZinggDocDir() + "/model.html"; } + @Override @JsonIgnore public String getZinggDataDocFile() { return getZinggDocDir() + "/data.html"; @@ -363,6 +385,7 @@ public String getZinggDataDocFile() { * * @return the path for internal Zingg usage */ + @Override @JsonIgnore public String getZinggBaseTrainingDataDir() { return getZinggBaseModelDir() + "/trainingData/"; @@ -375,6 +398,7 @@ public String getZinggBaseTrainingDataDir() { * * @return the path for internal Zingg usage */ + @Override @JsonIgnore public String getZinggTrainingDataUnmarkedDir() { return this.getZinggBaseTrainingDataDir() + "/unmarked/"; @@ -385,6 +409,7 @@ public String getZinggTrainingDataUnmarkedDir() { * * @return the path for internal Zingg usage */ + @Override @JsonIgnore public String getZinggTrainingDataMarkedDir() { return this.getZinggBaseTrainingDataDir() + "/marked/"; @@ -395,6 +420,7 @@ public String getZinggTrainingDataMarkedDir() { * * @return the path for internal Zingg usage */ + @Override @JsonIgnore public String getZinggPreprocessedDataDir() { return zinggDir + "/preprocess"; @@ -406,6 +432,7 @@ public String getZinggPreprocessedDataDir() { * * @return the blockFile */ + @Override @JsonIgnore public String getBlockFile() { return getZinggModelDir() + "/block/zingg.block"; @@ -416,6 +443,7 @@ public String getBlockFile() { * * @return model path */ + @Override @JsonIgnore public String getModel() { return getZinggModelDir() + "/classifier/best.model"; @@ -423,60 +451,73 @@ public String getModel() { + @Override public int getJobId() { return jobId; } + @Override public void setJobId(int jobId) { this.jobId = jobId; } + @Override public boolean getCollectMetrics() { return collectMetrics; } + @Override public void setCollectMetrics(boolean collectMetrics) { this.collectMetrics = collectMetrics; } + @Override public float getStopWordsCutoff() { return stopWordsCutoff; } + @Override public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException { if (stopWordsCutoff > 1 || stopWordsCutoff < 0) throw new ZinggClientException("Stop words cutoff should be between 0 and 1"); this.stopWordsCutoff = stopWordsCutoff; } + @Override public boolean getShowConcise() { return showConcise; } + @Override public void setShowConcise(boolean showConcise) { this.showConcise = showConcise; } + @Override public String getColumn() { return column; } + @Override public void setColumn(String column) { this.column = column; } + @Override public long getBlockSize() { return blockSize; } + @Override public void setBlockSize(long blockSize){ this.blockSize = blockSize; } + @Override @JsonIgnore public String[] getPipeNames() { Pipe[] input = this.getData(); @@ -488,6 +529,7 @@ public String[] getPipeNames() { return sourceNames; } + @Override @JsonIgnore public String getStopWordsDir() { return getZinggBaseModelDir() + "/stopWords/"; diff --git a/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java 
b/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java index c20a7bd3a..df3a7c3c5 100644 --- a/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java +++ b/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java @@ -44,7 +44,7 @@ public ArgumentsUtil( Class argsClass) { * @throws ZinggClientException * in case of invalid/wrong json/file not found */ - public Arguments createArgumentsFromJSON(String filePath) + public IArguments createArgumentsFromJSON(String filePath) throws ZinggClientException { return createArgumentsFromJSON(filePath, "match"); } @@ -58,7 +58,7 @@ public Arguments createArgumentsFromJSON(String filePath) * @throws ZinggClientException * in case of invlaid/wrong json/file not found */ - public Arguments createArgumentsFromJSON(String filePath, String phase) + public IArguments createArgumentsFromJSON(String filePath, String phase) throws ZinggClientException { try { ObjectMapper mapper = new ObjectMapper(); @@ -69,7 +69,7 @@ public Arguments createArgumentsFromJSON(String filePath, String phase) module.addDeserializer(List.class, new FieldDefinition.MatchTypeDeserializer()); mapper.registerModule(module); */ - Arguments args = mapper.readValue(new File(filePath), argsClass); + IArguments args = mapper.readValue(new File(filePath), argsClass); LOG.warn("phase is " + phase); checkValid(args, phase); return args; @@ -89,7 +89,7 @@ public Arguments createArgumentsFromJSON(String filePath, String phase) * @throws ZinggClientException * in case there is an error in writing to file */ - public void writeArgumentsToJSON(String filePath, Arguments args) + public void writeArgumentsToJSON(String filePath, IArguments args) throws ZinggClientException { try { ObjectMapper mapper = new ObjectMapper(); @@ -103,7 +103,7 @@ public void writeArgumentsToJSON(String filePath, Arguments args) } } - public void checkValid(Arguments args, String phase) throws ZinggClientException { + public void checkValid(IArguments args, String phase) throws ZinggClientException { if (phase.equals("train") || phase.equals("match") || phase.equals("trainMatch") || phase.equals("link")) { checkIsValid(args); } @@ -115,13 +115,13 @@ else if (!phase.equalsIgnoreCase("WEB")){ } } - public Arguments createArgumentsFromJSONString(String data, String phase) + public IArguments createArgumentsFromJSONString(String data, String phase) throws ZinggClientException { try { ObjectMapper mapper = new ObjectMapper(); mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, true); - Arguments args = mapper.readValue(data, argsClass); + IArguments args = mapper.readValue(data, argsClass); LOG.warn("phase is " + phase); checkValid(args, phase); return args; @@ -132,7 +132,7 @@ public Arguments createArgumentsFromJSONString(String data, String phase) } } - public Arguments createArgumentsFromJSONTemplate(String filePath, String phase) + public IArguments createArgumentsFromJSONTemplate(String filePath, String phase) throws ZinggClientException { try { LOG.warn("Config Argument is " + filePath); @@ -140,7 +140,7 @@ public Arguments createArgumentsFromJSONTemplate(String filePath, String phase) String template = new String(encoded, StandardCharsets.UTF_8); Map env = System.getenv(); String updatedJson = substituteVariables(template, env); - Arguments args = createArgumentsFromJSONString(updatedJson, phase); + IArguments args = createArgumentsFromJSONString(updatedJson, phase); return args; } catch (Exception e) { //e.printStackTrace(); @@ -174,7 +174,7 @@ public String 
substituteVariables(String template, Map variables return buffer.toString(); } - public void writeArgumentstoJSON(String filePath, Arguments args) throws ZinggClientException { + public void writeArgumentstoJSON(String filePath, IArguments args) throws ZinggClientException { try{ ObjectMapper mapper = new ObjectMapper(); mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, @@ -188,7 +188,7 @@ public void writeArgumentstoJSON(String filePath, Arguments args) throws ZinggCl } } - public String writeArgumentstoJSONString(Arguments args) throws ZinggClientException { + public String writeArgumentstoJSONString(IArguments args) throws ZinggClientException { try{ ObjectMapper mapper = new ObjectMapper(); mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, @@ -207,23 +207,23 @@ public String writeArgumentstoJSONString(Arguments args) throws ZinggClientExcep * @param args * @throws ZinggClientException */ - public void checkIsValid(Arguments args) throws ZinggClientException { - Arguments arg = new Arguments(); + public void checkIsValid(IArguments args) throws ZinggClientException { + IArguments arg = new Arguments(); arg.setTrainingSamples(args.getTrainingSamples()); arg.setData(args.getData()); arg.setNumPartitions(args.getNumPartitions()); arg.setFieldDefinition(args.getFieldDefinition()); } - public void checkIsValidForOthers(Arguments args) throws ZinggClientException { - Arguments arg = new Arguments(); + public void checkIsValidForOthers(IArguments args) throws ZinggClientException { + IArguments arg = new Arguments(); arg.setData(args.getData()); arg.setNumPartitions(args.getNumPartitions()); } - public void checkIsValidForLabelling(Arguments args) throws ZinggClientException { - Arguments arg = new Arguments(); + public void checkIsValidForLabelling(IArguments args) throws ZinggClientException { + IArguments arg = new Arguments(); //arg.setPositiveTrainingSamples(args.getPositiveTrainingSamples()); //arg.setNegativeTrainingSamples(args.getNegativeTrainingSamples()); arg.setData(args.getData()); diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index e98f125dc..29e1b41ab 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -17,7 +17,7 @@ */ public abstract class Client implements Serializable { private static final long serialVersionUID = 1L; - protected Arguments arguments; + protected IArguments arguments; protected ArgumentsUtil argsUtil; protected IZingg zingg; protected ClientOptions options; @@ -38,7 +38,7 @@ public abstract class Client implements Serializable { public Client() {} - public Client(Arguments args, ClientOptions options) throws ZinggClientException { + public Client(IArguments args, ClientOptions options) throws ZinggClientException { setOptions(options); try { buildAndSetArguments(args, options); @@ -51,7 +51,7 @@ public Client(Arguments args, ClientOptions options) throws ZinggClientException } } - public Client(Arguments args, ClientOptions options, S s) throws ZinggClientException { + public Client(IArguments args, ClientOptions options, S s) throws ZinggClientException { this(args, options); this.session = s; LOG.debug("Session passed is " + s); @@ -62,7 +62,7 @@ public Client(Arguments args, ClientOptions options, S s) throws ZinggClientExce - public void setZingg(Arguments args, ClientOptions options) throws Exception{ + public void setZingg(IArguments args, 
ClientOptions options) throws Exception{ IZinggFactory zf = getZinggFactory(); try{ setZingg(zf.get(ZinggOptions.getByValue(options.get(ClientOptions.PHASE).value.trim()))); @@ -78,7 +78,7 @@ public void setZingg(IZingg zingg) { this.zingg = zingg; } - public void buildAndSetArguments(Arguments args, ClientOptions options) { + public void buildAndSetArguments(IArguments args, ClientOptions options) { setOptions(options); int jobId = new Long(System.currentTimeMillis()).intValue(); if (options.get(options.JOBID)!= null) { @@ -154,7 +154,7 @@ public void printAnalyticsBanner(boolean collectMetrics) { } } - public abstract Client getClient(Arguments args, ClientOptions options) throws ZinggClientException; + public abstract Client getClient(IArguments args, ClientOptions options) throws ZinggClientException; public void mainMethod(String... args) { printBanner(); @@ -171,7 +171,7 @@ public void mainMethod(String... args) { } String phase = options.get(ClientOptions.PHASE).value.trim(); ZinggOptions.verifyPhase(phase); - Arguments arguments = null; + IArguments arguments = null; if (options.get(ClientOptions.CONF).value.endsWith("json")) { arguments = getArgsUtil().createArgumentsFromJSON(options.get(ClientOptions.CONF).value, phase); } @@ -242,7 +242,7 @@ public void stop() throws ZinggClientException{ zingg.cleanup(); } - public Arguments getArguments() { + public IArguments getArguments() { return arguments; } @@ -254,7 +254,7 @@ public void postMetrics() throws ZinggClientException { zingg.postMetrics(); } - public void setArguments(Arguments args) { + public void setArguments(IArguments args) { this.arguments = args; } diff --git a/common/client/src/main/java/zingg/common/client/IArguments.java b/common/client/src/main/java/zingg/common/client/IArguments.java new file mode 100644 index 000000000..d0a9849f4 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/IArguments.java @@ -0,0 +1,233 @@ +package zingg.common.client; + +import java.util.List; + +import zingg.common.client.pipe.Pipe; + +public interface IArguments { + + void setThreshold(double threshold); + + int getNumPartitions(); + + void setNumPartitions(int numPartitions) throws ZinggClientException; + + /** + * Sample size to use for seeding labelled data We dont want to run over all + * the data, as we want a quick way to seed some labeled data which we can + * manually edit + * + * @return sample percent as a float between 0 and 1 + */ + + float getLabelDataSampleSize(); + + /** + * Set the fraction of data to be used from complete data set to be used for + * seeding the labelled data Labelling is costly and we want a fast + * approximate way of looking at a small sample of the records and + * identifying expected matches and non matches + * + * @param labelDataSampleSize + * - float between 0 and 1 denoting portion of dataset to use in + * generating seed samples + * @throws ZinggClientException + */ + void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClientException; + + /** + * get the field definitions associated with this client + * + * @return list of field definitions + */ + List getFieldDefinition(); + + /** + * Set the field definitions consisting of match field indices, types and + * classes + * + * @see FieldDefinition + * @param fieldDefinition + * list of fields + * @throws ZinggClientException + */ + void setFieldDefinition(List fieldDefinition) throws ZinggClientException; + + /** + * Return the path to the positive labeled samples file + * + * @return path to labeled positive 
sample file + */ + Pipe[] getTrainingSamples(); + + /** + * Set the path to the positive training sample file + * + * @param positiveTrainingSamples + * path of the matching (positive)labeled sample file + * @throws ZinggClientException + */ + void setTrainingSamples(Pipe[] trainingSamples) throws ZinggClientException; + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + + public Pipe[] getZinggInternal() { + return zinggInternal; + } + + /** + * Set the location for Zingg to save its internal computations and + * models. Please set it to a place where the program has write access. + * + * @param zinggDir + * path to the Zingg directory + + public void setZinggInternal(Pipe[] zinggDir) { + this.zinggInternal = zinggDir; + } + */ + + String getModelId(); + + void setModelId(String modelId); + + /** + * Get the output directory where the match output will be saved + * + * @return output directory path of the result + */ + Pipe[] getOutput(); + + /** + * Set the output directory where the match result will be saved + * + * @param outputDir + * where the match result is saved + * @throws ZinggClientException + */ + void setOutput(Pipe[] outputDir) throws ZinggClientException; + + /** + * Get the location of the data file over which the match will be run + * + * @return path of data file to be matched + */ + Pipe[] getData(); + + /** + * Set the file path of the file to be matched. + * + * @param dataFile + * - full file path + * /home/zingg/path/to/my/file/to/be/matched.csv + * @throws ZinggClientException + */ + void setData(Pipe[] dataFile) throws ZinggClientException; + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + */ + String getZinggDir(); + + /** + * Set the location for Zingg to save its internal computations and + * models. Please set it to a place where the program has write access. + * + * @param zinggDir + * path to the Zingg directory + */ + void setZinggDir(String zinggDir); + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + */ + + String getZinggBaseModelDir(); + + String getZinggModelDir(); + + String getZinggDocDir(); + + String getZinggModelDocFile(); + + String getZinggDataDocFile(); + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + */ + String getZinggBaseTrainingDataDir(); + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + */ + String getZinggTrainingDataUnmarkedDir(); + + /** + * Location for internal Zingg use. + * + * @return the path for internal Zingg usage + */ + String getZinggTrainingDataMarkedDir(); + + /** + * Location for internal Zingg use. 
+ * + * @return the path for internal Zingg usage + */ + String getZinggPreprocessedDataDir(); + + /** + * This is an internal block file location Not to be used directly by the + * client + * + * @return the blockFile + */ + String getBlockFile(); + + /** + * This is the internal model location Not to be used by the client + * + * @return model path + */ + String getModel(); + + int getJobId(); + + void setJobId(int jobId); + + boolean getCollectMetrics(); + + void setCollectMetrics(boolean collectMetrics); + + float getStopWordsCutoff(); + + void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException; + + boolean getShowConcise(); + + void setShowConcise(boolean showConcise); + + String getColumn(); + + void setColumn(String column); + + long getBlockSize(); + + void setBlockSize(long blockSize); + + String[] getPipeNames(); + + String getStopWordsDir(); + +} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java b/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java index d409cb566..89e2ae44f 100644 --- a/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java +++ b/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java @@ -8,7 +8,7 @@ public interface ILabelDataViewHelper { List getClusterIds(ZFrame lines); - List getDisplayColumns(ZFrame lines, Arguments args); + List getDisplayColumns(ZFrame lines, IArguments args); ZFrame getCurrentPair(ZFrame lines, int index, List clusterIds, ZFrame clusterLines); diff --git a/common/client/src/main/java/zingg/common/client/ITrainingDataModel.java b/common/client/src/main/java/zingg/common/client/ITrainingDataModel.java index a37452397..fe2120b07 100644 --- a/common/client/src/main/java/zingg/common/client/ITrainingDataModel.java +++ b/common/client/src/main/java/zingg/common/client/ITrainingDataModel.java @@ -10,9 +10,9 @@ public interface ITrainingDataModel { public void updateLabellerStat(int selected_option, int increment); - public void writeLabelledOutput(ZFrame records, Arguments args) throws ZinggClientException; + public void writeLabelledOutput(ZFrame records, IArguments args) throws ZinggClientException; - public void writeLabelledOutput(ZFrame records, Arguments args, Pipe p) throws ZinggClientException; + public void writeLabelledOutput(ZFrame records, IArguments args, Pipe p) throws ZinggClientException; public long getPositivePairsCount(); diff --git a/common/client/src/main/java/zingg/common/client/IZingg.java b/common/client/src/main/java/zingg/common/client/IZingg.java index 306f4bec8..5e77a04db 100644 --- a/common/client/src/main/java/zingg/common/client/IZingg.java +++ b/common/client/src/main/java/zingg/common/client/IZingg.java @@ -4,7 +4,7 @@ public interface IZingg { - public void init(Arguments args, IZinggLicense license) + public void init(IArguments args, IZinggLicense license) throws ZinggClientException; public void execute() throws ZinggClientException; diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index f84dac788..231464c44 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -37,7 +37,7 @@ public void testSubstituteVariablesWithAllEnvVarSet() { .readAllBytes(Paths.get(getClass().getResource("../../../testArguments/testConfigTemplate.json.env").getFile())); String template = new 
String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); assertEquals(args.getData()[0].getProps().get(KEY_HEADER), env.get(KEY_HEADER)); assertEquals(args.getData()[0].getFormat(), env.get(KEY_FORMAT)); @@ -59,7 +59,7 @@ public void testSubstituteVariablesWithMissingEnvVar() { String template = new String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); fail("Exception was expected due to missing environment variable"); } catch (IOException | ZinggClientException e) { LOG.warn("Expected exception received due to missing environment variable"); @@ -79,7 +79,7 @@ public void testSubstituteVariablesWithBlankEnvVar() { String template = new String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); fail("Exception was expected for blank value for an environment variable"); } catch (IOException | ZinggClientException e) { @@ -122,7 +122,7 @@ public void testBooleanType() { String template = new String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); assertEquals(args.getOutput()[0].getProps().get(KEY_HEADER), env.get(KEY_HEADER)); } catch (IOException | ZinggClientException e) { @@ -166,7 +166,7 @@ public void testNumericWithinQuotes() { String template = new String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); //Numeric within quotes are allowed assertEquals(args.getModelId(), env.get(KEY_MODEL_ID)); @@ -189,7 +189,7 @@ public void testMalformedVariable() { String template = new String(encoded, StandardCharsets.UTF_8); String json = argsUtil.substituteVariables(template, env); - Arguments args = argsUtil.createArgumentsFromJSONString(json, ""); + IArguments args = argsUtil.createArgumentsFromJSONString(json, ""); fail("Exception was expected for malformed variable in json"); } catch (IOException | ZinggClientException e) { @@ -210,7 +210,7 @@ public void testInvalidFilePath() { @Test public void testMatchTypeMultiple() { - Arguments args; + IArguments args; try { args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configWithMultipleMatchTypes.json").getFile(), "test"); List fNameMatchType = args.getFieldDefinition().get(0).getMatchType(); @@ -229,7 +229,7 @@ public void testMatchTypeMultiple() { @Test public void testMatchTypeWrong() { - Arguments args; + IArguments args; try { args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configWithMultipleMatchTypesUnsupported.json").getFile(), "test"); //List fNameMatchType = args.getFieldDefinition().get(0).getMatchType(); diff --git a/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java 
b/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java index 73d29141e..c227f5187 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DataColDocumenter.java @@ -3,7 +3,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.core.Context; @@ -12,7 +12,7 @@ public abstract class DataColDocumenter extends DocumenterBase context, Arguments args) { + public DataColDocumenter(Context context, IArguments args) { super(context, args); } diff --git a/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java index f3dea702c..71737064d 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DataDocumenter.java @@ -8,8 +8,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.FieldData; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.core.Context; @@ -22,7 +22,7 @@ public abstract class DataDocumenter extends DocumenterBase data; - public DataDocumenter(Context context, Arguments args) { + public DataDocumenter(Context context, IArguments args) { super(context, args); data = getDSUtil().emptyDataFrame(); } diff --git a/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java b/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java index 8abba2c70..0f891c839 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java +++ b/common/core/src/main/java/zingg/common/core/documenter/DocumenterBase.java @@ -9,7 +9,7 @@ import freemarker.template.Template; import freemarker.template.TemplateExceptionHandler; import freemarker.template.Version; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; import zingg.common.core.Context; @@ -19,7 +19,7 @@ public abstract class DocumenterBase extends ZinggBase{ private static final long serialVersionUID = 1L; protected static Configuration config; - public DocumenterBase(Context context, Arguments args) { + public DocumenterBase(Context context, IArguments args) { super.context = context; super.args = args; config = createConfigurationObject(); diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java index f443cdc5c..41d215e63 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelColDocumenter.java @@ -6,7 +6,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.core.Context; @@ -18,7 +18,7 @@ public abstract class ModelColDocumenter extends 
DocumenterBase context, Arguments args) { + public ModelColDocumenter(Context context, IArguments args) { super(context, args); } diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java index 1782bd960..75363e71c 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java @@ -10,7 +10,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; @@ -31,7 +31,7 @@ public abstract class ModelDocumenter extends DocumenterBase markedRecords; protected ZFrame unmarkedRecords; - public ModelDocumenter(Context context, Arguments args) { + public ModelDocumenter(Context context, IArguments args) { super(context, args); markedRecords = getDSUtil().emptyDataFrame(); } diff --git a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index ef21600e3..e4e43109a 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -3,7 +3,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -21,7 +21,7 @@ public FindAndLabeller() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { finder.init(args, license); labeller.init(args, license); super.init(args, license); diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index 14273ba2c..0c6024621 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -5,8 +5,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; import zingg.common.client.ILabelDataViewHelper; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -41,7 +41,7 @@ public List getClusterIds(ZFrame lines) { @Override - public List getDisplayColumns(ZFrame lines, Arguments args) { + public List getDisplayColumns(ZFrame lines, IArguments args) { return getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); } diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index 162c033f9..e6521ec21 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -2,7 +2,8 @@ import org.apache.commons.logging.Log; import 
org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; + +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -21,7 +22,7 @@ public TrainMatcher() { } @Override - public void init(Arguments args, IZinggLicense license) + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { trainer.init(args, license); matcher.init(args, license); diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java index 4e6424789..c11f75fc0 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataModel.java @@ -3,8 +3,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; import zingg.common.client.ITrainingDataModel; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -68,13 +68,13 @@ else if (selected_option == ColValues.MATCH_TYPE_NOT_SURE) { } - public void writeLabelledOutput(ZFrame records, Arguments args) throws ZinggClientException { + public void writeLabelledOutput(ZFrame records, IArguments args) throws ZinggClientException { Pipe p = getOutputPipe(args); writeLabelledOutput(records,args,p); } - public void writeLabelledOutput(ZFrame records, Arguments args, Pipe p) throws ZinggClientException { + public void writeLabelledOutput(ZFrame records, IArguments args, Pipe p) throws ZinggClientException { if (records == null) { LOG.warn("No labelled records"); return; @@ -82,7 +82,7 @@ public void writeLabelledOutput(ZFrame records, Arguments args, Pipe p) t getPipeUtil().write(records, args,p); } - public Pipe getOutputPipe(Arguments args) { + public Pipe getOutputPipe(IArguments args) { return getPipeUtil().getTrainingDataMarkedPipe(args); } diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index 45466dd77..e4b5b4c39 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -5,8 +5,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; import zingg.common.client.ILabelDataViewHelper; import zingg.common.client.ITrainingDataModel; import zingg.common.client.IZingg; @@ -30,7 +30,7 @@ public abstract class ZinggBase implements Serializable, IZingg { - protected Arguments args; + protected IArguments args; protected Context context; protected String name; @@ -63,7 +63,7 @@ public ZinggBase() { - public void init(Arguments args, IZinggLicense license) + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { startTime = System.currentTimeMillis(); this.args = args; @@ -89,11 +89,11 @@ public void postMetrics() { Analytics.postEvent(zinggOptions.getValue(), collectMetrics); } - public Arguments getArgs() { + public IArguments getArgs() { return this.args; } - public void setArgs(Arguments args) { + public void setArgs(IArguments args) { this.args = args; } 
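
The hunks in this patch apply one mechanical substitution throughout: executors, documenters, recommenders and utils now depend on the IArguments interface instead of the concrete Arguments class, so argument objects can come from JSON via ArgumentsUtil, from a test stub, or from an embedding application, without any change to the consuming code. Below is a minimal sketch of the consuming side, assuming only the IArguments accessors introduced above; the ReportingStep class itself is illustrative and not part of this patch.

import zingg.common.client.Arguments;
import zingg.common.client.IArguments;
import zingg.common.client.ZinggClientException;

public class ReportingStep {

    // Depends only on the interface; any IArguments implementation can be passed in.
    private final IArguments args;

    public ReportingStep(IArguments args) {
        this.args = args;
    }

    // Uses only accessors declared on IArguments in this patch.
    public String describe() {
        return "model " + args.getModelId() + " under " + args.getZinggDir()
                + " with " + args.getNumPartitions() + " partitions";
    }

    public static void main(String[] a) throws ZinggClientException {
        IArguments args = new Arguments(); // the concrete class still implements IArguments
        args.setModelId("100");
        args.setZinggDir("/tmp/zingg");
        args.setNumPartitions(4);
        System.out.println(new ReportingStep(args).describe());
    }
}

The same decoupling is what lets checkIsValid above validate any IArguments instance by copying its fields through the setters, and it keeps ZinggBase, TrainingDataModel and the documenters compilable against future argument implementations.
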
diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java index d50991c85..ea42b7401 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java @@ -3,7 +3,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; @@ -27,7 +27,7 @@ public void setPipeUtil(PipeUtilBase pipeUtil) { - public ZFrame preprocessForStopWords(S session, Arguments args, ZFrame ds) throws ZinggClientException { + public ZFrame preprocessForStopWords(S session, IArguments args, ZFrame ds) throws ZinggClientException { /* List wordList = new ArrayList(); for (FieldDefinition def : args.getFieldDefinition()) { diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java index 12e4343cf..b45c6d250 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java @@ -8,8 +8,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; @@ -24,9 +24,9 @@ public abstract class StopWordsRemover implements Serializable{ protected static final int COLUMN_INDEX_DEFAULT = 0; protected Context context; - protected Arguments args; + protected IArguments args; - public StopWordsRemover(Context context,Arguments args) { + public StopWordsRemover(Context context,IArguments args) { super(); this.context = context; this.args = args; @@ -90,11 +90,11 @@ public void setContext(Context context) { this.context = context; } - public Arguments getArgs() { + public IArguments getArgs() { return args; } - public void setArgs(Arguments args) { + public void setArgs(IArguments args) { this.args = args; } diff --git a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java index 6105f63a0..9e29c6ba7 100644 --- a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java +++ b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java @@ -5,7 +5,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggClientException; @@ -17,9 +17,9 @@ public abstract class StopWordsRecommender { public static final Log LOG = LogFactory.getLog(StopWordsRecommender.class); protected Context context; protected ZFrame data; - public Arguments args; + public IArguments args; - public StopWordsRecommender(Context context,Arguments args) { + public StopWordsRecommender(Context context,IArguments args) { this.context = context; this.args = args; } diff --git 
a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java index 959fbfa7a..d2bb54eb9 100644 --- a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java @@ -6,8 +6,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; import zingg.common.client.ZFrame; @@ -43,7 +43,7 @@ public abstract Block getBlock(ZFrame sample, ZFrame posi public Tree> createBlockingTree(ZFrame testData, ZFrame positives, double sampleFraction, long blockSize, - Arguments args, + IArguments args, ListMap> hashFunctions) throws Exception, ZinggClientException { ZFrame sample = testData.sample(false, sampleFraction); sample = sample.cache(); @@ -81,13 +81,13 @@ public Tree> createBlockingTree(ZFrame testData, public Tree> createBlockingTreeFromSample(ZFrame testData, - ZFrame positives, double sampleFraction, long blockSize, Arguments args, + ZFrame positives, double sampleFraction, long blockSize, IArguments args, ListMap hashFunctions) throws Exception, ZinggClientException { ZFrame sample = testData.sample(false, sampleFraction); return createBlockingTree(sample, positives, sampleFraction, blockSize, args, hashFunctions); } - public void writeBlockingTree(Tree> blockingTree, Arguments args) throws Exception, ZinggClientException { + public void writeBlockingTree(Tree> blockingTree, IArguments args) throws Exception, ZinggClientException { byte[] byteArray = Util.convertObjectIntoByteArray(blockingTree); PipeUtilBase pu = getPipeUtil(); pu.write(getTreeDF(byteArray), args, pu.getBlockingTreePipe(args)); @@ -102,7 +102,7 @@ public byte[] getTreeFromDF(ZFrame z){ } - public Tree> readBlockingTree(Arguments args) throws Exception, ZinggClientException{ + public Tree> readBlockingTree(IArguments args) throws Exception, ZinggClientException{ PipeUtilBase pu = getPipeUtil(); ZFrame tree = pu.read(false, 1, false, pu.getBlockingTreePipe(args)); //tree.show(); diff --git a/common/core/src/main/java/zingg/common/core/util/DSUtil.java b/common/core/src/main/java/zingg/common/core/util/DSUtil.java index 79a091b7f..d0a24ebb1 100644 --- a/common/core/src/main/java/zingg/common/core/util/DSUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/DSUtil.java @@ -1,8 +1,8 @@ package zingg.common.core.util; -import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -108,7 +108,7 @@ public ZFrame joinWithItself(ZFrame lines, String joinColumn, return join(lines, lines1, joinColumn, filter); } - public ZFrame joinWithItselfSourceSensitive(ZFrame lines, String joinColumn, Arguments args) throws Exception { + public ZFrame joinWithItselfSourceSensitive(ZFrame lines, String joinColumn, IArguments args) throws Exception { ZFrame lines1 = getPrefixedColumnsDS(lines).cache(); @@ -118,7 +118,7 @@ public ZFrame joinWithItselfSourceSensitive(ZFrame lines, Str return join(lines, lines1, joinColumn, false); } - public ZFrame alignLinked(ZFrame dupesActual, Arguments args) { + public ZFrame alignLinked(ZFrame dupesActual, IArguments 
args) { dupesActual = dupesActual.cache(); List cols = new ArrayList(); @@ -153,7 +153,7 @@ public ZFrame alignLinked(ZFrame dupesActual, Arguments args) return dupes1; } - public ZFrame alignDupes(ZFrame dupesActual, Arguments args) { + public ZFrame alignDupes(ZFrame dupesActual, IArguments args) { dupesActual = dupesActual.cache(); List cols = new ArrayList(); @@ -191,7 +191,7 @@ public ZFrame alignDupes(ZFrame dupesActual, Arguments args) return dupes1; } - public ZFrame allFieldsEqual(ZFrame a, Arguments args) { + public ZFrame allFieldsEqual(ZFrame a, IArguments args) { for (FieldDefinition def : args.getFieldDefinition()) { if (! (def.getMatchType() == null || def.getMatchType().contains(MatchType.DONT_USE))) { //columns.add(def.getFieldName()); @@ -204,7 +204,7 @@ public ZFrame allFieldsEqual(ZFrame a, Arguments args) { } - public List getFieldDefColumns (ZFrame ds, Arguments args, boolean includeZid, boolean showConcise) { + public List getFieldDefColumns (ZFrame ds, IArguments args, boolean includeZid, boolean showConcise) { List cols = new ArrayList(); if (includeZid) { cols.add(ds.col(ColName.ID_COL)); @@ -220,7 +220,7 @@ public List getFieldDefColumns (ZFrame ds, Arguments args, boolean } - public ZFrame getFieldDefColumnsDS(ZFrame ds, Arguments args, boolean includeZid) { + public ZFrame getFieldDefColumnsDS(ZFrame ds, IArguments args, boolean includeZid) { return select(ds, getFieldDefColumns(ds, args, includeZid, false)); } @@ -228,7 +228,7 @@ public ZFrame select(ZFrame ds, List cols) { return ds.select(cols); } - public ZFrame dropDuplicates(ZFrame a, Arguments args) { + public ZFrame dropDuplicates(ZFrame a, IArguments args) { LOG.info("duplicates before " + a.count()); List cols = new ArrayList(); for (FieldDefinition def : args.getFieldDefinition()) { @@ -243,11 +243,11 @@ public ZFrame dropDuplicates(ZFrame a, Arguments args) { return a; } - public ZFrame getTraining(PipeUtilBase pipeUtil, Arguments args) { + public ZFrame getTraining(PipeUtilBase pipeUtil, IArguments args) { return getTraining(pipeUtil, args, pipeUtil.getTrainingDataMarkedPipe(args)); } - private ZFrame getTraining(PipeUtilBase pipeUtil, Arguments args, Pipe p) { + private ZFrame getTraining(PipeUtilBase pipeUtil, IArguments args, Pipe p) { ZFrame trFile = null; try{ trFile = pipeUtil.read(false, false, p); @@ -271,7 +271,7 @@ private ZFrame getTraining(PipeUtilBase pipeUtil, Argument return trFile; } - public List getFieldDefinitionFiltered(Arguments args, MatchType type) { + public List getFieldDefinitionFiltered(IArguments args, MatchType type) { return args.getFieldDefinition() .stream() .filter(f -> !(f.getMatchType() == null || f.getMatchType().contains(type))) diff --git a/common/core/src/main/java/zingg/common/core/util/ModelUtil.java b/common/core/src/main/java/zingg/common/core/util/ModelUtil.java index c3b61439c..ed8d0951a 100644 --- a/common/core/src/main/java/zingg/common/core/util/ModelUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/ModelUtil.java @@ -5,8 +5,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -25,7 +25,7 @@ public abstract class ModelUtil { public abstract FeatureFactory getFeatureFactory(); - public void loadFeatures(Arguments args) throws ZinggClientException { + public 
void loadFeatures(IArguments args) throws ZinggClientException { try{ LOG.info("Start reading internal configurations and functions"); if (args.getFieldDefinition() != null) { @@ -48,7 +48,7 @@ public void loadFeatures(Arguments args) throws ZinggClientException { } } - public Map> getFeaturers(Arguments args) throws ZinggClientException { + public Map> getFeaturers(IArguments args) throws ZinggClientException { if (this.featurers == null) loadFeatures(args); return this.featurers; } @@ -58,7 +58,7 @@ public void setFeaturers(Map> featurers) { } public Model createModel(ZFrame positives, - ZFrame negatives, boolean isLabel, Arguments args) throws Exception, ZinggClientException { + ZFrame negatives, boolean isLabel, IArguments args) throws Exception, ZinggClientException { LOG.info("Learning similarity rules"); ZFrame posLabeledPointsWithLabel = positives.withColumn(ColName.MATCH_FLAG_COL, ColValues.MATCH_TYPE_MATCH); posLabeledPointsWithLabel = posLabeledPointsWithLabel.cache(); @@ -77,9 +77,9 @@ public Model createModel(ZFrame positives, return model; } - public abstract Model getModel(boolean isLabel, Arguments args) throws ZinggClientException; + public abstract Model getModel(boolean isLabel, IArguments args) throws ZinggClientException; - public abstract Model loadModel(boolean isLabel, Arguments args) throws ZinggClientException; + public abstract Model loadModel(boolean isLabel, IArguments args) throws ZinggClientException; diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtil.java b/common/core/src/main/java/zingg/common/core/util/PipeUtil.java index e1cb7b773..b76f8a371 100644 --- a/common/core/src/main/java/zingg/common/core/util/PipeUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/PipeUtil.java @@ -6,7 +6,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.pipe.FilePipe; @@ -185,7 +185,7 @@ public ZFrame read(boolean addExtraCol, boolean addLineNo, int numPartit return rows; } - public void write(ZFrame toWriteOrig, Arguments args, + public void write(ZFrame toWriteOrig, IArguments args, Pipe... 
pipes) throws ZinggClientException { try { for (Pipe p: pipes) { @@ -310,21 +310,21 @@ public void writePerSource(Dataset toWrite, Arguments args, JavaSparkConte } */ - public Pipe getTrainingDataUnmarkedPipe(Arguments args) { + public Pipe getTrainingDataUnmarkedPipe(IArguments args) { Pipe p = new Pipe(); p.setFormat(Pipe.FORMAT_PARQUET); p.setProp(FilePipe.LOCATION, args.getZinggTrainingDataUnmarkedDir()); return p; } - public Pipe getTrainingDataMarkedPipe(Arguments args) { + public Pipe getTrainingDataMarkedPipe(IArguments args) { Pipe p = new Pipe(); p.setFormat(Pipe.FORMAT_PARQUET); p.setProp(FilePipe.LOCATION, args.getZinggTrainingDataMarkedDir()); return p; } - public Pipe getModelDocumentationPipe(Arguments args) { + public Pipe getModelDocumentationPipe(IArguments args) { Pipe p = new Pipe(); p.setFormat(Pipe.FORMAT_TEXT); p.setProp(FilePipe.LOCATION, args.getZinggModelDocFile()); @@ -334,7 +334,7 @@ public Pipe getModelDocumentationPipe(Arguments args) { - public Pipe getStopWordsPipe(Arguments args, String fileName) { + public Pipe getStopWordsPipe(IArguments args, String fileName) { Pipe p = new Pipe(); p.setFormat(Pipe.FORMAT_CSV); p.setProp(FilePipe.HEADER, "true"); @@ -344,7 +344,7 @@ public Pipe getStopWordsPipe(Arguments args, String fileName) { return p; } - public Pipe getBlockingTreePipe(Arguments args) { + public Pipe getBlockingTreePipe(IArguments args) { Pipe p = new Pipe(); p.setFormat(Pipe.FORMAT_PARQUET); p.setProp(FilePipe.LOCATION, args.getBlockFile()); diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java b/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java index 887ddc054..bdb363a2b 100644 --- a/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java +++ b/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java @@ -1,6 +1,6 @@ package zingg.common.core.util; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.pipe.Pipe; @@ -29,19 +29,19 @@ public ZFrame read(boolean addLineNo, int numPartitions, public ZFrame read(boolean addExtraCol, boolean addLineNo, int numPartitions, boolean addSource, Pipe... pipes) throws ZinggClientException; - public void write(ZFrame toWriteOrig, Arguments args, Pipe... pipes) + public void write(ZFrame toWriteOrig, IArguments args, Pipe... 
pipes) throws ZinggClientException; - public Pipe getTrainingDataUnmarkedPipe(Arguments args); + public Pipe getTrainingDataUnmarkedPipe(IArguments args); - public Pipe getTrainingDataMarkedPipe(Arguments args); + public Pipe getTrainingDataMarkedPipe(IArguments args); - public Pipe getModelDocumentationPipe(Arguments args); + public Pipe getModelDocumentationPipe(IArguments args); - public Pipe getBlockingTreePipe(Arguments args); + public Pipe getBlockingTreePipe(IArguments args); - public Pipe getStopWordsPipe(Arguments args, String string); + public Pipe getStopWordsPipe(IArguments args, String string); public String getPipesAsString(Pipe[] pipes); diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index fda95945f..211ea279f 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -8,9 +8,9 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.SparkSession; -import zingg.common.client.Arguments; import zingg.common.client.Client; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; import zingg.common.client.IZinggFactory; import zingg.common.client.ZinggClientException; import zingg.common.client.license.IZinggLicense; @@ -25,16 +25,16 @@ public class SparkClient extends Client, Row, Column private static final long serialVersionUID = 1L; - public SparkClient(Arguments args, ClientOptions options) throws ZinggClientException { + public SparkClient(IArguments args, ClientOptions options) throws ZinggClientException { super(args, options); } - public SparkClient(Arguments args, ClientOptions options, ZSparkSession s) throws ZinggClientException { + public SparkClient(IArguments args, ClientOptions options, ZSparkSession s) throws ZinggClientException { super(args, options, s); } - public SparkClient(Arguments args, ClientOptions options, SparkSession s) throws ZinggClientException { + public SparkClient(IArguments args, ClientOptions options, SparkSession s) throws ZinggClientException { this(args, options, new ZSparkSession(s,null)); } @@ -56,7 +56,7 @@ public IZinggFactory getZinggFactory() throws InstantiationException, IllegalAcc @Override - public Client, Row, Column, DataType> getClient(Arguments args, + public Client, Row, Column, DataType> getClient(IArguments args, ClientOptions options) throws ZinggClientException { // TODO Auto-generated method stub SparkClient client = null; diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java b/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java index 6bd4ec3f0..dcc75bd95 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java @@ -20,13 +20,14 @@ import org.junit.jupiter.api.BeforeAll; import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; import zingg.spark.client.SparkFrame; public class TestSparkFrameBase { - public static Arguments args; + public static IArguments args; public static JavaSparkContext ctx; public static SparkSession spark; diff --git a/spark/client/src/test/java/zingg/spark/client/TestArguments.java b/spark/client/src/test/java/zingg/spark/client/TestArguments.java index 210fc0c16..79fb61aaf 100644 --- a/spark/client/src/test/java/zingg/spark/client/TestArguments.java +++ 
b/spark/client/src/test/java/zingg/spark/client/TestArguments.java @@ -11,6 +11,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ArgumentsUtil; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; import zingg.common.client.pipe.Pipe; @@ -22,7 +23,7 @@ public class TestArguments { protected ArgumentsUtil argsUtil = new ArgumentsUtil(); @Test public void testWriteArgumentObjectToJSONFile() { - Arguments args = new Arguments(); + IArguments args = new Arguments(); try { FieldDefinition fname = new FieldDefinition(); fname.setFieldName("fname"); @@ -55,7 +56,7 @@ public void testWriteArgumentObjectToJSONFile() { argsUtil.writeArgumentsToJSON("/tmp/configFromArgObject.json", args); //reload the same config file to check if deserialization is successful - Arguments newArgs = argsUtil.createArgumentsFromJSON("/tmp/configFromArgObject.json", "test"); + IArguments newArgs = argsUtil.createArgumentsFromJSON("/tmp/configFromArgObject.json", "test"); assertEquals(newArgs.getModelId(), "500", "Model id is different"); assertEquals(newArgs.getBlockSize(), 400L, "Block size is different"); assertEquals(newArgs.getFieldDefinition().get(0).getFieldName(), "fname", "Field Definition[0]'s name is different"); diff --git a/spark/client/src/test/java/zingg/spark/client/TestSparkClient.java b/spark/client/src/test/java/zingg/spark/client/TestSparkClient.java index 4cbffc444..df404f9c9 100644 --- a/spark/client/src/test/java/zingg/spark/client/TestSparkClient.java +++ b/spark/client/src/test/java/zingg/spark/client/TestSparkClient.java @@ -7,12 +7,13 @@ import zingg.common.client.Arguments; import zingg.common.client.Client; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; public class TestSparkClient { @Test public void testSetColumnOptionThroughBuildAndSetArguments() { - Arguments arguments = new Arguments(); + IArguments arguments = new Arguments(); String[] args = {ClientOptions.CONF, "configFile", ClientOptions.PHASE, "train", ClientOptions.COLUMN, "columnName", ClientOptions.SHOW_CONCISE, "true", ClientOptions.LICENSE, "licenseFile"}; ClientOptions options = new ClientOptions(args); Client client = new SparkClient(); diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java index 09f8e5642..146c51e08 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataColDocumenter.java @@ -6,7 +6,7 @@ import org.apache.spark.sql.types.DataType; import freemarker.template.Version; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.core.Context; import zingg.common.core.documenter.DataColDocumenter; import zingg.common.core.documenter.RowWrapper; @@ -20,7 +20,7 @@ public class SparkDataColDocumenter extends DataColDocumenter, Row, Column,DataType> context, Arguments args) { + public SparkDataColDocumenter(Context, Row, Column,DataType> context, IArguments args) { super(context, args); } diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java index ab8a1f32e..c591b99e4 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java +++ 
b/spark/core/src/main/java/zingg/spark/core/documenter/SparkDataDocumenter.java @@ -6,7 +6,7 @@ import org.apache.spark.sql.types.DataType; import freemarker.template.Version; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.core.Context; import zingg.common.core.documenter.DataDocumenter; import zingg.common.core.documenter.RowWrapper; @@ -20,7 +20,7 @@ public class SparkDataDocumenter extends DataDocumenter, Row, Column,DataType> context, Arguments args) { + public SparkDataDocumenter(Context, Row, Column,DataType> context, IArguments args) { super(context, args); } diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java index 5cd49fb61..53b4b1829 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelColDocumenter.java @@ -6,7 +6,7 @@ import org.apache.spark.sql.types.DataType; import freemarker.template.Version; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.core.Context; import zingg.common.core.documenter.ModelColDocumenter; import zingg.common.core.documenter.RowWrapper; @@ -21,7 +21,7 @@ public class SparkModelColDocumenter extends ModelColDocumenter, Row, Column,DataType> context, Arguments args) { + public SparkModelColDocumenter(Context, Row, Column,DataType> context, IArguments args) { super(context, args); } diff --git a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java index 2a210bbb3..70a4f07aa 100644 --- a/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/documenter/SparkModelDocumenter.java @@ -6,7 +6,7 @@ import org.apache.spark.sql.types.DataType; import freemarker.template.Version; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.core.Context; import zingg.common.core.documenter.ModelDocumenter; import zingg.common.core.documenter.RowWrapper; @@ -20,7 +20,7 @@ public class SparkModelDocumenter extends ModelDocumenter, Row, Column,DataType> context, Arguments args) { + public SparkModelDocumenter(Context, Row, Column,DataType> context, IArguments args) { super(context, args); super.modelColDoc = new SparkModelColDocumenter(context,args); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 7b3ab3ec1..2a45904f6 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -31,7 +31,7 @@ public SparkDocumenter() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git 
a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index e4e0db2b9..3b395f85d 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -8,7 +8,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -30,7 +30,7 @@ public SparkFindAndLabeller() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index c23e07bd9..bba1779c6 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -8,7 +8,7 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -35,7 +35,7 @@ public SparkLabelUpdater() { @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index fd38d2e64..90c6d1585 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -35,7 +35,7 @@ public SparkLabeller(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index dde7c3633..3033f0813 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ 
-30,7 +30,7 @@ public SparkLinker() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index ee6b1145c..5bbbe0401 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -8,7 +8,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -40,7 +40,7 @@ public SparkMatcher(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index e12876b04..9cf793ad9 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -11,8 +11,8 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -32,7 +32,7 @@ public SparkPeekModel() { } @Override - public void init(Arguments args, IZinggLicense license) + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().setUtils(); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 541dcd297..a34676143 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -33,7 +33,7 @@ public SparkRecommender() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 86600939e..b24db07cb 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -7,7 +7,7 @@ import 
org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -29,7 +29,7 @@ public SparkTrainMatcher() { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 6bad40892..bd2124b4d 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -33,7 +33,7 @@ public SparkTrainer(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index b2813d4f5..9c0816128 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.ZinggOptions; import zingg.common.client.license.IZinggLicense; @@ -32,7 +32,7 @@ public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { } @Override - public void init(Arguments args, IZinggLicense license) throws ZinggClientException { + public void init(IArguments args, IZinggLicense license) throws ZinggClientException { super.init(args, license); getContext().init(license); } diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index bc53b4652..d20c3fa38 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -13,7 +13,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataTypes; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.core.Context; import zingg.common.core.preprocess.StopWordsRemover; @@ -28,7 +28,7 @@ public class SparkStopWordsRemover extends StopWordsRemover, Row, Column,DataType> context,Arguments args) { + public SparkStopWordsRemover(Context, Row, Column,DataType> context,IArguments args) { super(context,args); this.udfName = registerUDF(); } diff --git 
a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java index b7fec9a99..32873f035 100644 --- a/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/recommender/SparkStopWordsRecommender.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.core.Context; import zingg.common.core.recommender.StopWordsRecommender; import zingg.spark.client.ZSparkSession; @@ -24,7 +24,7 @@ public class SparkStopWordsRecommender extends StopWordsRecommender, Row, Column,DataType> context,Arguments args) { + public SparkStopWordsRecommender(Context, Row, Column,DataType> context,IArguments args) { super(context,args); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java index a3a01679e..2ad472636 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkModelUtil.java @@ -6,7 +6,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.core.feature.FeatureFactory; import zingg.common.core.model.Model; @@ -26,7 +26,7 @@ public SparkModelUtil(ZSparkSession s) { this.session = s; } - public Model, Row, Column> getModel(boolean isLabel, Arguments args) throws ZinggClientException{ + public Model, Row, Column> getModel(boolean isLabel, IArguments args) throws ZinggClientException{ Model, Row, Column> model = null; if (isLabel) { model = new SparkLabelModel(getFeaturers(args)); @@ -39,7 +39,7 @@ public Model, Row, Column> getModel(boolean @Override public Model, Row, Column> loadModel(boolean isLabel, - Arguments args) throws ZinggClientException { + IArguments args) throws ZinggClientException { Model, Row, Column> model = getModel(isLabel, args); model.load(args.getModel()); return model; diff --git a/spark/core/src/test/java/zingg/block/TestBlock.java b/spark/core/src/test/java/zingg/block/TestBlock.java index 2066a1718..cdcd09ba9 100644 --- a/spark/core/src/test/java/zingg/block/TestBlock.java +++ b/spark/core/src/test/java/zingg/block/TestBlock.java @@ -16,9 +16,9 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; -import zingg.common.client.Arguments; import zingg.common.client.ArgumentsUtil; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -38,7 +38,7 @@ public void testTree() throws Throwable { ZFrame, Row, Column> posDf = getPosData(); - Arguments args = getArguments(); + IArguments args = getArguments(); // form tree SparkBlockingTreeUtil blockingTreeUtil = new SparkBlockingTreeUtil(zSession, zsCTX.getPipeUtil()); @@ -74,10 +74,10 @@ public void testTree() throws Throwable { - private Arguments getArguments() throws ZinggClientException { + private IArguments getArguments() throws ZinggClientException { String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile(); - Arguments args = 
argsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); + IArguments args = argsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); List fdList = getFieldDefList(); diff --git a/spark/core/src/test/java/zingg/common/core/documenter/TestDataDocumenter.java b/spark/core/src/test/java/zingg/common/core/documenter/TestDataDocumenter.java index cd176a4af..762096cdf 100644 --- a/spark/core/src/test/java/zingg/common/core/documenter/TestDataDocumenter.java +++ b/spark/core/src/test/java/zingg/common/core/documenter/TestDataDocumenter.java @@ -14,6 +14,7 @@ import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.pipe.FilePipe; import zingg.common.client.pipe.Pipe; import zingg.spark.core.documenter.SparkDataDocumenter; @@ -22,7 +23,7 @@ public class TestDataDocumenter extends ZinggSparkTester { public static final Log LOG = LogFactory.getLog(TestDataDocumenter.class); - private Arguments docArguments = new Arguments(); + private IArguments docArguments = new Arguments(); @BeforeEach public void setUp(){ try { diff --git a/spark/core/src/test/java/zingg/common/core/documenter/TestModelDocumenter.java b/spark/core/src/test/java/zingg/common/core/documenter/TestModelDocumenter.java index daf30bcf4..39d8f9db7 100644 --- a/spark/core/src/test/java/zingg/common/core/documenter/TestModelDocumenter.java +++ b/spark/core/src/test/java/zingg/common/core/documenter/TestModelDocumenter.java @@ -17,6 +17,7 @@ import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.util.ColName; import zingg.spark.client.SparkFrame; import zingg.spark.core.documenter.SparkModelDocumenter; @@ -25,7 +26,7 @@ public class TestModelDocumenter extends ZinggSparkTester { public static final Log LOG = LogFactory.getLog(TestModelDocumenter.class); - Arguments docArguments = new Arguments(); + IArguments docArguments = new Arguments(); @BeforeEach public void setUp(){ diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java index 7df457474..c9ace5f3f 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java @@ -20,6 +20,7 @@ import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; @@ -68,7 +69,7 @@ public void testStopWordsSingleColumn() throws ZinggClientException { eventFD.setMatchType(matchTypelistFuzzy); fdList.add(eventFD); - Arguments stmtArgs = new Arguments(); + IArguments stmtArgs = new Arguments(); stmtArgs.setFieldDefinition(fdList); StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs); diff --git a/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java b/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java index a1683d7d6..5aae8e235 100644 --- a/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java +++ b/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java @@ -20,6 +20,7 @@ import zingg.common.client.Arguments; import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; import 
zingg.common.client.util.ColName; @@ -54,7 +55,7 @@ public void testGetFieldDefColumnsWhenShowConciseIsTrue() throws ZinggClientExce fieldDef.add(def1); fieldDef.add(def2); fieldDef.add(def3); - Arguments args = null; + IArguments args = null; try { args = new Arguments(); args.setFieldDefinition(fieldDef); @@ -105,7 +106,7 @@ public void testGetFieldDefColumnsWhenShowConciseIsFalse() throws ZinggClientExc fieldDef.add(def1); fieldDef.add(def2); fieldDef.add(def3); - Arguments args = null; + IArguments args = null; try { args = new Arguments(); args.setFieldDefinition(fieldDef); diff --git a/spark/core/src/test/java/zingg/common/core/util/TestPipeUtil.java b/spark/core/src/test/java/zingg/common/core/util/TestPipeUtil.java index 087e1e6ca..974cc3e49 100644 --- a/spark/core/src/test/java/zingg/common/core/util/TestPipeUtil.java +++ b/spark/core/src/test/java/zingg/common/core/util/TestPipeUtil.java @@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; +import zingg.common.client.IArguments; import zingg.common.client.pipe.FilePipe; import zingg.common.client.pipe.Pipe; import zingg.spark.core.executor.ZinggSparkTester; @@ -17,7 +18,7 @@ public class TestPipeUtil extends ZinggSparkTester{ @Test public void testStopWordsPipe() { - Arguments args = new Arguments(); + IArguments args = new Arguments(); String fileName = args.getStopWordsDir() + "file"; Pipe p = zsCTX.getPipeUtil().getStopWordsPipe(args, fileName); diff --git a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java index 00a6a11c9..adf26d3ac 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java @@ -18,6 +18,7 @@ import zingg.common.client.Arguments; import zingg.common.client.ArgumentsUtil; +import zingg.common.client.IArguments; import zingg.common.client.IZingg; import zingg.spark.client.ZSparkSession; import zingg.spark.core.util.SparkBlockingTreeUtil; @@ -29,7 +30,7 @@ public class ZinggSparkTester { - public static Arguments args; + public static IArguments args; public static JavaSparkContext ctx; public static SparkSession spark; public static ZinggSparkContext zsCTX; From cda1f9f38bbd4de3fc1ec7a7582061654c23a171 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 18 Dec 2023 19:27:40 +0530 Subject: [PATCH 027/219] extend using IArguments --- .../src/main/java/zingg/common/client/ArgumentsUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java b/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java index df3a7c3c5..97a55c4f2 100644 --- a/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java +++ b/common/client/src/main/java/zingg/common/client/ArgumentsUtil.java @@ -19,7 +19,7 @@ public class ArgumentsUtil { - protected Class<? extends Arguments> argsClass; + protected Class<? extends IArguments> argsClass; private static final String ENV_VAR_MARKER_START = "$"; private static final String ENV_VAR_MARKER_END = "$"; private static final String ESC = "\\"; @@ -31,7 +31,7 @@ public ArgumentsUtil() { this(Arguments.class); } - public ArgumentsUtil( Class<? extends Arguments> argsClass) { + public ArgumentsUtil( Class<? extends IArguments> argsClass) { this.argsClass = argsClass; } From dae6010c2a821d75b0824f61ac4cbe9ee3a4f716 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 27 Dec 2023 18:31:07 +0530 Subject: [PATCH 028/219] generate docs to show concise
data if user so asks issue #723 --- .../java/zingg/common/client/Arguments.java | 17 ++++- .../java/zingg/common/client/IArguments.java | 4 ++ .../zingg/common/client/TestArguments.java | 22 ++++++ .../testArguments/configTestDontUse.json | 70 +++++++++++++++++++ .../core/documenter/ModelDocumenter.java | 34 +++++++-- 5 files changed, 142 insertions(+), 5 deletions(-) create mode 100644 common/client/src/test/resources/testArguments/configTestDontUse.json diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index a47323eef..a8b75ba54 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -4,6 +4,7 @@ import java.io.Serializable; import java.io.StringWriter; import java.util.List; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -169,7 +170,21 @@ public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClient public List getFieldDefinition() { return fieldDefinition; } - + + @JsonIgnore @Override + public List getFieldDefinitionDontUse() { + return fieldDefinition.stream() + .filter(x->x.matchType.contains(MatchType.DONT_USE)) + .collect(Collectors.toList()); + } + + @JsonIgnore @Override + public List getFieldDefinitionToUse() { + return fieldDefinition.stream() + .filter(x->!x.matchType.contains(MatchType.DONT_USE)) + .collect(Collectors.toList()); + } + /** * Set the field definitions consisting of match field indices, types and * classes diff --git a/common/client/src/main/java/zingg/common/client/IArguments.java b/common/client/src/main/java/zingg/common/client/IArguments.java index d0a9849f4..1ec7369ed 100644 --- a/common/client/src/main/java/zingg/common/client/IArguments.java +++ b/common/client/src/main/java/zingg/common/client/IArguments.java @@ -42,6 +42,10 @@ public interface IArguments { */ List getFieldDefinition(); + public List getFieldDefinitionDontUse(); + + public List getFieldDefinitionToUse(); + /** * Set the field definitions consisting of match field indices, types and * classes diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index 231464c44..8ea1252f4 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -9,6 +9,7 @@ import java.nio.file.Paths; import java.util.Arrays; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; @@ -227,6 +228,27 @@ public void testMatchTypeMultiple() { } + @Test + public void testMatchTypeFilter() { + IArguments args; + try { + args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configTestDontUse.json").getFile(), "test"); + + List dontUseList = args.getFieldDefinitionDontUse(); + assertEquals(dontUseList.size(), 3); + + List matchList = args.getFieldDefinitionToUse(); + assertEquals(matchList.size(), 4); + + } catch (Exception | ZinggClientException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + fail("Could not read config"); + } + + } + + @Test public void testMatchTypeWrong() { IArguments args; diff --git a/common/client/src/test/resources/testArguments/configTestDontUse.json b/common/client/src/test/resources/testArguments/configTestDontUse.json new file mode 100644 
index 000000000..f1f1ed225 --- /dev/null +++ b/common/client/src/test/resources/testArguments/configTestDontUse.json @@ -0,0 +1,70 @@ +{ + "fieldDefinition":[ + { + "fieldName" : "fname", + "matchType" : "fuzzy,null_or_blank", + "fields" : "fname", + "dataType": "string" + }, + { + "fieldName" : "lname", + "matchType" : "fuzzy", + "fields" : "lname", + "dataType": "string" + }, + { + "fieldName" : "stNo", + "matchType": "exact", + "fields" : "stNo", + "dataType": "string" + }, + { + "fieldName" : "add1", + "matchType": "fuzzy,dont_use", + "fields" : "add1", + "dataType": "string" + }, + { + "fieldName" : "add2", + "matchType": "dont_use", + "fields" : "add2", + "dataType": "string" + }, + { + "fieldName" : "city", + "matchType": "dont_use,fuzzy", + "fields" : "city", + "dataType": "string" + }, + { + "fieldName" : "state", + "matchType": "fuzzy", + "fields" : "state", + "dataType": "string" + } + ], + "output" : [{ + "name":"output", + "format":"csv", + "props": { + "location": "/tmp/zinggOutput", + "delimiter": ",", + "header":true + } + }], + "data" : [{ + "name":"test", + "format":"csv", + "props": { + "location": "examples/febrl/test.csv", + "delimiter": ",", + "header":false + }, + "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" + }], + "labelDataSampleSize" : 0.5, + "numPartitions":4, + "modelId": 100, + "zinggDir": "models" + +} diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java index 75363e71c..edaf1f7b0 100644 --- a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java @@ -10,6 +10,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; @@ -45,8 +46,9 @@ protected void createModelDocument() throws ZinggClientException { try { LOG.info("Model document generation starts"); - markedRecords = getMarkedRecords().sortAscending(ColName.CLUSTER_COLUMN); - unmarkedRecords = getUnmarkedRecords().sortAscending(ColName.CLUSTER_COLUMN); + // drop columns which are marked dont_use if show concise is true + markedRecords = filterForConcise(getMarkedRecords().sortAscending(ColName.CLUSTER_COLUMN)); + unmarkedRecords = filterForConcise(getUnmarkedRecords().sortAscending(ColName.CLUSTER_COLUMN)); Map root = populateTemplateData(); writeModelDocument(root); @@ -82,8 +84,7 @@ protected Map populateTemplateData() { } else { // fields required to generate basic document - List columnList = args.getFieldDefinition().stream().map(fd -> fd.getFieldName()) - .collect(Collectors.toList()); + List columnList = getColumnList(); root.put(TemplateFields.NUM_COLUMNS, columnList.size()); root.put(TemplateFields.COLUMNS, columnList.toArray()); root.put(TemplateFields.CLUSTERS, Collections.emptyList()); @@ -94,6 +95,31 @@ protected Map populateTemplateData() { return root; } + protected ZFrame filterForConcise(ZFrame df) { + if (args.getShowConcise()) { + List dontUseFields = getFieldNames( + (List) args.getFieldDefinitionDontUse()); + if(!dontUseFields.isEmpty()) { + df = df.drop(dontUseFields.toArray(new String[dontUseFields.size()])); + } + } + return df; + } + + protected List getColumnList() {
List fieldList = args.getFieldDefinition(); + //drop columns which are marked dont_use if show concise is true + if (args.getShowConcise()) { + fieldList = args.getFieldDefinitionToUse(); + } + return getFieldNames(fieldList); + } + + protected List getFieldNames(List fieldList) { + return fieldList.stream().map(fd -> fd.getFieldName()) + .collect(Collectors.toList()); + } + private void putSummaryCounts(Map root) { // Get the count if not empty ZFrame markedRecordsPairSummary = markedRecords.groupByCount(ColName.MATCH_FLAG_COL, PAIR_WISE_COUNT); From 5b7caf1d7cc6187f596e1585f78205ce2df03ca6 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 27 Dec 2023 18:43:34 +0530 Subject: [PATCH 029/219] doc updated for show concise option --- docs/generatingdocumentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/generatingdocumentation.md b/docs/generatingdocumentation.md index 11fb09921..90f7e21d9 100644 --- a/docs/generatingdocumentation.md +++ b/docs/generatingdocumentation.md @@ -3,7 +3,7 @@ Zingg generates readable documentation about the training data, including those marked as matches, as well as non-matches. The documentation is written to the zinggDir/modelId folder and can be built using the following ``` -./scripts/zingg.sh --phase generateDocs --conf +./scripts/zingg.sh --phase generateDocs --conf ``` The generated documentation file can be viewed in a browser and looks as shown below. From e0492ed3a2a53deb13857e1d48e17d1241e2228d Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 27 Dec 2023 19:37:40 +0530 Subject: [PATCH 030/219] collectAsListOfStrings changed to collectFirstColumn --- common/client/src/main/java/zingg/common/client/ZFrame.java | 2 +- .../java/zingg/common/core/preprocess/StopWordsRemover.java | 2 +- spark/client/src/main/java/zingg/spark/client/SparkFrame.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index 1a0861917..d7c2c97ad 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -20,7 +20,7 @@ public interface ZFrame { public ZFrame selectExpr(String...
col); public ZFrame distinct(); public List collectAsList(); - public List collectAsListOfStrings(); + public List collectFirstColumn(); public ZFrame toDF(String[] cols); public ZFrame toDF(String col1, String col2); diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java index b45c6d250..96a21b309 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java @@ -65,7 +65,7 @@ protected String getStopWordColumnName(ZFrame stopWords) { } protected List getWordList(ZFrame stopWords, String stopWordColumn) { - return stopWords.select(stopWordColumn).collectAsListOfStrings(); + return stopWords.select(stopWordColumn).collectFirstColumn(); } /** diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index 18957ac67..6de857dcf 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -87,7 +87,7 @@ public List collectAsList() { return df.collectAsList(); } - public List collectAsListOfStrings() { + public List collectFirstColumn() { return df.as(Encoders.STRING()).collectAsList(); } From 438a6f29f449fe4cdf5851c831dd231e21a8ca39 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 27 Dec 2023 20:38:27 +0530 Subject: [PATCH 031/219] use method to get the field definition otherwise creating problem in overriding classes --- .../client/src/main/java/zingg/common/client/Arguments.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index a8b75ba54..b94614af8 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -173,14 +173,14 @@ public List getFieldDefinition() { @JsonIgnore @Override public List getFieldDefinitionDontUse() { - return fieldDefinition.stream() + return getFieldDefinition().stream() .filter(x->x.matchType.contains(MatchType.DONT_USE)) .collect(Collectors.toList()); } @JsonIgnore @Override public List getFieldDefinitionToUse() { - return fieldDefinition.stream() + return getFieldDefinition().stream() .filter(x->!x.matchType.contains(MatchType.DONT_USE)) .collect(Collectors.toList()); } From bf2c7328cf106ed071a13b6a7477f34c554e21c8 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 28 Dec 2023 08:50:22 +0530 Subject: [PATCH 032/219] refactor to fieldDefUtil --- .../java/zingg/common/client/Arguments.java | 14 --------- .../zingg/common/client/FieldDefUtil.java | 30 +++++++++++++++++++ .../java/zingg/common/client/IArguments.java | 4 --- .../zingg/common/client/TestArguments.java | 6 ++-- .../core/documenter/ModelDocumenter.java | 8 +++-- 5 files changed, 40 insertions(+), 22 deletions(-) create mode 100644 common/client/src/main/java/zingg/common/client/FieldDefUtil.java diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index b94614af8..3f396f090 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -171,20 +171,6 @@ public List getFieldDefinition() { return 
fieldDefinition; } - @JsonIgnore @Override - public List getFieldDefinitionDontUse() { - return getFieldDefinition().stream() - .filter(x->x.matchType.contains(MatchType.DONT_USE)) - .collect(Collectors.toList()); - } - - @JsonIgnore @Override - public List getFieldDefinitionToUse() { - return getFieldDefinition().stream() - .filter(x->!x.matchType.contains(MatchType.DONT_USE)) - .collect(Collectors.toList()); - } - /** * Set the field definitions consisting of match field indices, types and * classes diff --git a/common/client/src/main/java/zingg/common/client/FieldDefUtil.java b/common/client/src/main/java/zingg/common/client/FieldDefUtil.java new file mode 100644 index 000000000..c8b06a55f --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/FieldDefUtil.java @@ -0,0 +1,30 @@ +package zingg.common.client; + +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +/** + * + * Util methods related to FieldDefinition objects + * + */ +public class FieldDefUtil implements Serializable{ + + private static final long serialVersionUID = 1L; + + public List getFieldDefinitionDontUse(List fieldDefinition) { + return fieldDefinition.stream() + .filter(x->x.matchType.contains(MatchType.DONT_USE)) + .collect(Collectors.toList()); + } + + public List getFieldDefinitionToUse(List fieldDefinition) { + return fieldDefinition.stream() + .filter(x->!x.matchType.contains(MatchType.DONT_USE)) + .collect(Collectors.toList()); + } + + + +} diff --git a/common/client/src/main/java/zingg/common/client/IArguments.java b/common/client/src/main/java/zingg/common/client/IArguments.java index 1ec7369ed..d0a9849f4 100644 --- a/common/client/src/main/java/zingg/common/client/IArguments.java +++ b/common/client/src/main/java/zingg/common/client/IArguments.java @@ -42,10 +42,6 @@ public interface IArguments { */ List getFieldDefinition(); - public List getFieldDefinitionDontUse(); - - public List getFieldDefinitionToUse(); - /** * Set the field definitions consisting of match field indices, types and * classes diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index 8ea1252f4..9c3780c7e 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -26,6 +26,8 @@ public class TestArguments { public static final Log LOG = LogFactory.getLog(TestArguments.class); protected ArgumentsUtil argsUtil = new ArgumentsUtil(); + + protected FieldDefUtil fieldDefUtil = new FieldDefUtil(); @Test public void testSubstituteVariablesWithAllEnvVarSet() { try { @@ -234,10 +236,10 @@ public void testMatchTypeFilter() { try { args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configTestDontUse.json").getFile(), "test"); - List dontUseList = args.getFieldDefinitionDontUse(); + List dontUseList = fieldDefUtil.getFieldDefinitionDontUse(args.getFieldDefinition()); assertEquals(dontUseList.size(), 3); - List matchList = args.getFieldDefinitionToUse(); + List matchList = fieldDefUtil.getFieldDefinitionToUse(args.getFieldDefinition()); assertEquals(matchList.size(), 4); } catch (Exception | ZinggClientException e) { diff --git a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java index edaf1f7b0..e2eba17f9 100644 --- 
a/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java +++ b/common/core/src/main/java/zingg/common/core/documenter/ModelDocumenter.java @@ -10,6 +10,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import zingg.common.client.FieldDefUtil; import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; import zingg.common.client.ZFrame; @@ -31,10 +32,13 @@ public abstract class ModelDocumenter extends DocumenterBase modelColDoc; protected ZFrame markedRecords; protected ZFrame unmarkedRecords; + + protected FieldDefUtil fieldDefUtil; public ModelDocumenter(Context context, IArguments args) { super(context, args); markedRecords = getDSUtil().emptyDataFrame(); + fieldDefUtil = new FieldDefUtil(); } public void process() throws ZinggClientException { @@ -98,7 +102,7 @@ protected Map populateTemplateData() { protected ZFrame filterForConcise(ZFrame df) { if (args.getShowConcise()) { List dontUseFields = getFieldNames( - (List) args.getFieldDefinitionDontUse()); + (List) fieldDefUtil.getFieldDefinitionDontUse(args.getFieldDefinition())); if(!dontUseFields.isEmpty()) { df = df.drop(dontUseFields.toArray(new String[dontUseFields.size()])); } @@ -110,7 +114,7 @@ protected List getColumnList() { List fieldList = args.getFieldDefinition(); //drop columns which are don't use if show concise is true if (args.getShowConcise()) { - fieldList = args.getFieldDefinitionToUse(); + fieldList = fieldDefUtil.getFieldDefinitionToUse(args.getFieldDefinition()); } return getFieldNames(fieldList); } From 108843f36b98ba0d788d903adcad9bfb859610a1 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 28 Dec 2023 10:18:31 +0530 Subject: [PATCH 033/219] refactor to TestFieldDefUtil --- .../zingg/common/client/TestArguments.java | 21 ---------- .../zingg/common/client/TestFieldDefUtil.java | 41 +++++++++++++++++++ 2 files changed, 41 insertions(+), 21 deletions(-) create mode 100644 common/client/src/test/java/zingg/common/client/TestFieldDefUtil.java diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index 9c3780c7e..65bf20583 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -230,27 +230,6 @@ public void testMatchTypeMultiple() { } - @Test - public void testMatchTypeFilter() { - IArguments args; - try { - args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configTestDontUse.json").getFile(), "test"); - - List dontUseList = fieldDefUtil.getFieldDefinitionDontUse(args.getFieldDefinition()); - assertEquals(dontUseList.size(), 3); - - List matchList = fieldDefUtil.getFieldDefinitionToUse(args.getFieldDefinition()); - assertEquals(matchList.size(), 4); - - } catch (Exception | ZinggClientException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - fail("Could not read config"); - } - - } - - @Test public void testMatchTypeWrong() { IArguments args; diff --git a/common/client/src/test/java/zingg/common/client/TestFieldDefUtil.java b/common/client/src/test/java/zingg/common/client/TestFieldDefUtil.java new file mode 100644 index 000000000..3d78d4618 --- /dev/null +++ b/common/client/src/test/java/zingg/common/client/TestFieldDefUtil.java @@ -0,0 +1,41 @@ +package zingg.common.client; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static 
org.junit.jupiter.api.Assertions.fail; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.jupiter.api.Test; + + +public class TestFieldDefUtil { + + public static final Log LOG = LogFactory.getLog(TestFieldDefUtil.class); + protected ArgumentsUtil argsUtil = new ArgumentsUtil(); + + protected FieldDefUtil fieldDefUtil = new FieldDefUtil(); + + @Test + public void testMatchTypeFilter() { + IArguments args; + try { + args = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configTestDontUse.json").getFile(), "test"); + + List dontUseList = fieldDefUtil.getFieldDefinitionDontUse(args.getFieldDefinition()); + assertEquals(dontUseList.size(), 3); + + List matchList = fieldDefUtil.getFieldDefinitionToUse(args.getFieldDefinition()); + assertEquals(matchList.size(), 4); + + } catch (Exception | ZinggClientException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + fail("Could not read config"); + } + + } + + +} From 468c09d709af525d52dded1ab206650e6c714d09 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 28 Dec 2023 10:19:58 +0530 Subject: [PATCH 034/219] remove redundant variable --- .../client/src/test/java/zingg/common/client/TestArguments.java | 1 - 1 file changed, 1 deletion(-) diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index 65bf20583..290bac5d7 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -27,7 +27,6 @@ public class TestArguments { public static final Log LOG = LogFactory.getLog(TestArguments.class); protected ArgumentsUtil argsUtil = new ArgumentsUtil(); - protected FieldDefUtil fieldDefUtil = new FieldDefUtil(); @Test public void testSubstituteVariablesWithAllEnvVarSet() { try { From a0330d72c78672e0ff71f6dcf957d09bb0fb4579 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Fri, 29 Dec 2023 16:35:58 +0530 Subject: [PATCH 035/219] removed enum zingg options --- .../zingg/common/client/ZinggOptions.java | 57 ------------------- 1 file changed, 57 deletions(-) delete mode 100644 common/client/src/main/java/zingg/common/client/ZinggOptions.java diff --git a/common/client/src/main/java/zingg/common/client/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/ZinggOptions.java deleted file mode 100644 index 8c3d32173..000000000 --- a/common/client/src/main/java/zingg/common/client/ZinggOptions.java +++ /dev/null @@ -1,57 +0,0 @@ -package zingg.common.client; - -import zingg.common.client.util.Util; - -public enum ZinggOptions { - - TRAIN("train"), - MATCH("match"), - TRAIN_MATCH("trainMatch"), - FIND_TRAINING_DATA("findTrainingData"), - LABEL("label"), - LINK("link"), - GENERATE_DOCS("generateDocs"), - RECOMMEND("recommend"), - UPDATE_LABEL("updateLabel"), - FIND_AND_LABEL("findAndLabel"), - ASSESS_MODEL("assessModel"), - PEEK_MODEL("peekModel"), - EXPORT_MODEL("exportModel"), - APPROVE_CLUSTERS("approveClusters"), - RUN_INCREMENTAL("runIncremental"); - - private String value; - - ZinggOptions(String s) { - this.value = s; - } - - public static String[] getAllZinggOptions() { - ZinggOptions[] zo = ZinggOptions.values(); - int i = 0; - String[] s = new String[zo.length]; - for (ZinggOptions z: zo) { - s[i++] = z.getValue(); - } - return s; - } - - public String getValue() { - return value; - } - - public static final ZinggOptions 
getByValue(String value){ - for (ZinggOptions zo: ZinggOptions.values()) { - if (zo.value.equals(value)) return zo; - } - return null; - } - - public static void verifyPhase(String phase) throws ZinggClientException { - if (getByValue(phase) == null) { - String message = "'" + phase + "' is not a valid phase. " - + "Valid phases are: " + Util.join(getAllZinggOptions(), "|"); - throw new ZinggClientException(message); - } - } -} \ No newline at end of file From 1f42386e911fd2295668926cbab737fd55a1deef Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 1 Jan 2024 17:42:56 +0530 Subject: [PATCH 036/219] changed version to 0.4.1 --- Dockerfile | 10 +++++----- README.md | 11 +++-------- .../src/main/java/zingg/common/client/Client.java | 2 +- .../java/zingg/common/core/executor/ZinggBase.java | 4 ++-- docs/README.md | 1 + docs/stepbystep/installation/docker/README.md | 4 ++-- .../docker/file-read-write-permissions.md | 2 +- .../docker/sharing-custom-data-and-config-files.md | 2 +- .../installing-from-release/installing-zingg.md | 6 +++--- pom.xml | 2 +- python/PKG-INFO | 2 +- python/docs/conf.py | 2 +- python/version.py | 2 +- python/zingg/client.py | 2 +- python/zingg/databricks.py | 2 +- scripts/zingg.sh | 2 +- test/note.txt | 2 +- test/run_tests.sh | 2 +- 18 files changed, 28 insertions(+), 32 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9787725d7..75a65bd08 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,17 +4,17 @@ USER 0 RUN apt-get update && \ apt install -y curl vim ENV SPARK_MASTER local[*] -ENV ZINGG_HOME /zingg-0.4.0 +ENV ZINGG_HOME /zingg-0.4.1 ENV PATH $ZINGG_HOME/scripts:$PATH ENV LANG C.UTF-8 WORKDIR / USER root -WORKDIR /zingg-0.4.0 -RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.4.0/zingg-0.4.0-spark-3.4.0.tar.gz | \ +WORKDIR /zingg-0.4.1 +RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.4.1/zingg-0.4.1-spark-3.4.0.tar.gz | \ tar --extract --gzip --strip=1 RUN pip install -r python/requirements.txt RUN pip install zingg -RUN chmod -R a+rwx /zingg-0.4.0/models -RUN chown -R 1001 /zingg-0.4.0/models +RUN chmod -R a+rwx /zingg-0.4.1/models +RUN chown -R 1001 /zingg-0.4.1/models USER 1001 diff --git a/README.md b/README.md index b6eca735b..2900fa693 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## On our way to 0.4.0 release of Zingg soon..............stay tuned! +## On our way to 0.4.1 release of Zingg soon..............stay tuned! ## The Problem @@ -60,13 +60,8 @@ See Zingg in action [here](https://www.youtube.com/watch?v=zOabyZxN9b0) The easiest way to get started with Zingg is through Docker and by running the prebuilt models. 
``` -<<<<<<< HEAD -docker pull zingg/zingg:0.4.0 -docker run -it zingg/zingg:0.4.0 bash -======= -docker pull zingg/zingg:0.3.4 -docker run -it zingg/zingg:0.3.4 bash ->>>>>>> main +docker pull zingg/zingg:0.4.1 +docker run -it zingg/zingg:0.4.1 bash ./scripts/zingg.sh --phase match --conf examples/febrl/config.json ``` diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 0bf9c05c2..e86377577 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -120,7 +120,7 @@ else if (args.getJobId() != -1) { } public void printBanner() { - String versionStr = "0.4.0"; + String versionStr = "0.4.1"; LOG.info(""); LOG.info("********************************************************"); LOG.info("* Zingg AI *"); diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index efbd2fada..ca100d755 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -86,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); Analytics.track(Metric.MODEL_ID, args.getModelId(), collectMetrics); - Analytics.track(Metric.ZINGG_VERSION, "0.4.0", collectMetrics); + Analytics.track(Metric.ZINGG_VERSION, "0.4.1", collectMetrics); Analytics.trackEnvProp(Metric.DATABRICKS_RUNTIME_VERSION, collectMetrics); Analytics.trackEnvProp(Metric.DB_INSTANCE_TYPE, collectMetrics); Analytics.trackEnvProp(Metric.JAVA_HOME, collectMetrics); @@ -96,7 +96,7 @@ public void postMetrics() { //Analytics.trackEnvProp(Metric.USER_NAME, collectMetrics); //Analytics.trackEnvProp(Metric.USER_HOME, collectMetrics); Analytics.trackDomain(Metric.DOMAIN, collectMetrics); - Analytics.track(Metric.ZINGG_VERSION, "0.4.0", collectMetrics); + Analytics.track(Metric.ZINGG_VERSION, "0.4.1", collectMetrics); Analytics.postEvent(zinggOptions.getValue(), collectMetrics); } diff --git a/docs/README.md b/docs/README.md index 8a849e159..b610e9f13 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,6 +8,7 @@ description: Hope you find us useful :-) This is the latest documentation for Zingg. Release wise documentation can be accessed through: +* [v0.4.1 ](https://docs.zingg.ai/zingg0.4.1/) * [v0.4.0 ](https://docs.zingg.ai/zingg0.4.0/) * [v0.3.4 ](https://docs.zingg.ai/zingg0.3.4/) * [v0.3.3](https://docs.zingg.ai/zingg0.3.3/) diff --git a/docs/stepbystep/installation/docker/README.md b/docs/stepbystep/installation/docker/README.md index 13a9f1b30..13c51de42 100644 --- a/docs/stepbystep/installation/docker/README.md +++ b/docs/stepbystep/installation/docker/README.md @@ -9,8 +9,8 @@ description: From pre-built Docker image with all dependencies included The easiest way to get started is to pull the Docker image with the last release of Zingg. ``` -docker pull zingg/zingg:0.4.0 -docker run -it zingg/zingg:0.4.0 bash +docker pull zingg/zingg:0.4.1 +docker run -it zingg/zingg:0.4.1 bash ``` To know more about Docker, please refer to the official [docker documentation](https://docs.docker.com/). 
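Taken together, the Docker docs touched by this version bump describe one end-to-end flow: pull the image, run it with the right permissions and a bind mount, and invoke a Zingg phase. A minimal sketch of that flow, assuming a host working directory of `~/zingg-work` and running as the current host uid (both illustrative choices, not values taken from these docs):

```
# pull the image tag introduced by this commit
docker pull zingg/zingg:0.4.1
# run as the host uid, with a bind mount so models and output persist
docker run -u $(id -u) -v ~/zingg-work:/zingg-work -it zingg/zingg:0.4.1 bash
# inside the container, try the prebuilt febrl example
./scripts/zingg.sh --phase match --conf examples/febrl/config.json
```

Anything written under the mounted directory survives the container; as the docs that follow note, work done elsewhere in the container filesystem is lost once it stops.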
diff --git a/docs/stepbystep/installation/docker/file-read-write-permissions.md b/docs/stepbystep/installation/docker/file-read-write-permissions.md index 0c5ec43f0..e7ec5e8b3 100644 --- a/docs/stepbystep/installation/docker/file-read-write-permissions.md +++ b/docs/stepbystep/installation/docker/file-read-write-permissions.md @@ -9,5 +9,5 @@ A docker image is preferred to run with a non-root user. By default, the Zingg c ``` $ id uid=1000(abc) gid=1000(abc) groups=1000(abc) -$ docker run -u -it zingg/zingg:0.4.0 bash +$ docker run -u -it zingg/zingg:0.4.1 bash ``` diff --git a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md index c85a615a7..f6e83648a 100644 --- a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md +++ b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md @@ -7,7 +7,7 @@ description: Using custom data to save data files on host machine However, note that once the docker container is stopped, all the work done in that session is lost. If we want to use custom data or persist the generated model or data files, we have to use **Volumes** or **Bind mount** to share files between the two. ``` -docker run -v : -it zingg/zingg:0.4.0 bash +docker run -v : -it zingg/zingg:0.4.1 bash ``` The **\** directory from host will get mounted inside container at **\**. Any file written inside this directory will persist on the host machine and can be reused in a new container instance later. diff --git a/docs/stepbystep/installation/installing-from-release/installing-zingg.md b/docs/stepbystep/installation/installing-from-release/installing-zingg.md index 79bf1fc70..0ba07e2e6 100644 --- a/docs/stepbystep/installation/installing-from-release/installing-zingg.md +++ b/docs/stepbystep/installation/installing-from-release/installing-zingg.md @@ -6,13 +6,13 @@ description: Downloading and setting things up Download the tar zingg-version.tar.gz from the [Zingg releases page](https://github.com/zinggAI/zingg/releases) to a folder of your choice and run the following: -> gzip -d zingg-0.4.0.tar.gz ; tar xvf zingg-0.4.0.tar +> gzip -d zingg-0.4.1.tar.gz ; tar xvf zingg-0.4.1.tar -This will create a folder zingg-0.4.0 under the chosen folder. +This will create a folder zingg-0.4.1 under the chosen folder. Move the above folder to zingg. 
-> mv zingg-0.4.0 \~/zingg +> mv zingg-0.4.1 \~/zingg > export ZINGG\_HOME=path to zingg diff --git a/pom.xml b/pom.xml index d4370e1e2..cd60bffb5 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,7 @@ - 0.4.0 + 0.4.1 false false 8 diff --git a/python/PKG-INFO b/python/PKG-INFO index fd12b2c4a..dfff445f2 100644 --- a/python/PKG-INFO +++ b/python/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: zingg -Version: 0.4.0 +Version: 0.4.1 Summary: Zingg.ai Entity Resolution Home-page: www.zingg.ai Author: Zingg.AI diff --git a/python/docs/conf.py b/python/docs/conf.py index 81cf8a98b..d96167f0c 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -22,7 +22,7 @@ author = 'Zingg.AI' # The full version, including alpha/beta/rc tags -release = '0.4.0' +release = '0.4.1' # -- General configuration --------------------------------------------------- diff --git a/python/version.py b/python/version.py index 405432f9a..774acba9d 100644 --- a/python/version.py +++ b/python/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python3 -__version__: str = "0.4.0" \ No newline at end of file +__version__: str = "0.4.1" \ No newline at end of file diff --git a/python/zingg/client.py b/python/zingg/client.py index 3c748a978..7d613931a 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -20,7 +20,7 @@ _spark_ctxt = None _sqlContext = None _spark = None -_zingg_jar = 'zingg-0.4.0.jar' +_zingg_jar = 'zingg-0.4.1.jar' def initSparkClient(): global _spark_ctxt diff --git a/python/zingg/databricks.py b/python/zingg/databricks.py index 14c21b28e..8d7eb47fa 100644 --- a/python/zingg/databricks.py +++ b/python/zingg/databricks.py @@ -29,7 +29,7 @@ "job_cluster_key": "_cluster", "libraries": [ { - "whl":"dbfs:/FileStore/py/zingg-0.4.0-py2.py3-none-any.whl" + "whl":"dbfs:/FileStore/py/zingg-0.4.1-py2.py3-none-any.whl" }, { "pypi": { diff --git a/scripts/zingg.sh b/scripts/zingg.sh index 53b841ad2..6932c7a7a 100755 --- a/scripts/zingg.sh +++ b/scripts/zingg.sh @@ -1,6 +1,6 @@ #!/bin/bash #ZINGG_HOME=./assembly/target -ZINGG_JARS=$ZINGG_HOME/zingg-0.4.0.jar +ZINGG_JARS=$ZINGG_HOME/zingg-0.4.1.jar EMAIL=zingg@zingg.ai LICENSE=zinggLicense.txt log4j_setting="-Dlog4j2.configurationFile=file:log4j2.properties" diff --git a/test/note.txt b/test/note.txt index 4503641df..686bb0cdc 100644 --- a/test/note.txt +++ b/test/note.txt @@ -5,7 +5,7 @@ To run it: 1. cd test/ 2. pyspark < testInfraOwnGateway.py (or) -2. /opt/spark-3.2.4-bin-hadoop3.2/bin/spark-submit --jars ../common/client/target/zingg-common-client-0.4.0-SNAPSHOT.jar testInfraOwnGateway.py +2. 
/opt/spark-3.2.4-bin-hadoop3.2/bin/spark-submit --jars ../common/client/target/zingg-common-client-0.4.1-SNAPSHOT.jar testInfraOwnGateway.py If faced version mismatch issue: diff --git a/test/run_tests.sh b/test/run_tests.sh index 93ffcec82..7fc1397d6 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -2,7 +2,7 @@ # Set the paths to your JAR files and Spark binaries SPARK_HOME="/opt/spark-3.2.4-bin-hadoop3.2" -PY4J_JAR="../common/client/target/zingg-common-client-0.4.0.jar" +PY4J_JAR="../common/client/target/zingg-common-client-0.4.1.jar" # Run Spark with the required JAR files and your test script $SPARK_HOME/bin/spark-submit --jars $PY4J_JAR testInfra.py From f795de1b4e92280be14596158feb8b0f09a95348 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 1 Jan 2024 18:15:16 +0530 Subject: [PATCH 037/219] added snapshot to release version --- Dockerfile | 10 +++++----- README.md | 4 ++-- .../src/main/java/zingg/common/client/Client.java | 2 +- .../java/zingg/common/core/executor/ZinggBase.java | 4 ++-- docs/stepbystep/installation/docker/README.md | 4 ++-- .../installation/docker/file-read-write-permissions.md | 2 +- .../docker/sharing-custom-data-and-config-files.md | 2 +- .../installing-from-release/installing-zingg.md | 6 +++--- pom.xml | 2 +- python/docs/conf.py | 2 +- python/zingg/client.py | 2 +- python/zingg/databricks.py | 2 +- scripts/zingg.sh | 2 +- test/run_tests.sh | 4 ++-- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 75a65bd08..00abbfeed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,17 +4,17 @@ USER 0 RUN apt-get update && \ apt install -y curl vim ENV SPARK_MASTER local[*] -ENV ZINGG_HOME /zingg-0.4.1 +ENV ZINGG_HOME /zingg-0.4.1-SNAPSHOT ENV PATH $ZINGG_HOME/scripts:$PATH ENV LANG C.UTF-8 WORKDIR / USER root -WORKDIR /zingg-0.4.1 -RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.4.1/zingg-0.4.1-spark-3.4.0.tar.gz | \ +WORKDIR /zingg-0.4.1-SNAPSHOT +RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.4.1-SNAPSHOT/zingg-0.4.1-SNAPSHOT-spark-3.5.0.tar.gz | \ tar --extract --gzip --strip=1 RUN pip install -r python/requirements.txt RUN pip install zingg -RUN chmod -R a+rwx /zingg-0.4.1/models -RUN chown -R 1001 /zingg-0.4.1/models +RUN chmod -R a+rwx /zingg-0.4.1-SNAPSHOT/models +RUN chown -R 1001 /zingg-0.4.1-SNAPSHOT/models USER 1001 diff --git a/README.md b/README.md index 2900fa693..44c6a5f36 100644 --- a/README.md +++ b/README.md @@ -60,8 +60,8 @@ See Zingg in action [here](https://www.youtube.com/watch?v=zOabyZxN9b0) The easiest way to get started with Zingg is through Docker and by running the prebuilt models. 
``` -docker pull zingg/zingg:0.4.1 -docker run -it zingg/zingg:0.4.1 bash +docker pull zingg/zingg:0.4.1-SNAPSHOT +docker run -it zingg/zingg:0.4.1-SNAPSHOT bash ./scripts/zingg.sh --phase match --conf examples/febrl/config.json ``` diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index e86377577..7ac207c8a 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -120,7 +120,7 @@ else if (args.getJobId() != -1) { } public void printBanner() { - String versionStr = "0.4.1"; + String versionStr = "0.4.1-SNAPSHOT"; LOG.info(""); LOG.info("********************************************************"); LOG.info("* Zingg AI *"); diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index ca100d755..5a5229056 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -86,7 +86,7 @@ public void postMetrics() { Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics); Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics); Analytics.track(Metric.MODEL_ID, args.getModelId(), collectMetrics); - Analytics.track(Metric.ZINGG_VERSION, "0.4.1", collectMetrics); + Analytics.track(Metric.ZINGG_VERSION, "0.4.1-SNAPSHOT", collectMetrics); Analytics.trackEnvProp(Metric.DATABRICKS_RUNTIME_VERSION, collectMetrics); Analytics.trackEnvProp(Metric.DB_INSTANCE_TYPE, collectMetrics); Analytics.trackEnvProp(Metric.JAVA_HOME, collectMetrics); @@ -96,7 +96,7 @@ public void postMetrics() { //Analytics.trackEnvProp(Metric.USER_NAME, collectMetrics); //Analytics.trackEnvProp(Metric.USER_HOME, collectMetrics); Analytics.trackDomain(Metric.DOMAIN, collectMetrics); - Analytics.track(Metric.ZINGG_VERSION, "0.4.1", collectMetrics); + Analytics.track(Metric.ZINGG_VERSION, "0.4.1-SNAPSHOT", collectMetrics); Analytics.postEvent(zinggOptions.getValue(), collectMetrics); } diff --git a/docs/stepbystep/installation/docker/README.md b/docs/stepbystep/installation/docker/README.md index 13c51de42..4cc395341 100644 --- a/docs/stepbystep/installation/docker/README.md +++ b/docs/stepbystep/installation/docker/README.md @@ -9,8 +9,8 @@ description: From pre-built Docker image with all dependencies included The easiest way to get started is to pull the Docker image with the last release of Zingg. ``` -docker pull zingg/zingg:0.4.1 -docker run -it zingg/zingg:0.4.1 bash +docker pull zingg/zingg:0.4.1-SNAPSHOT +docker run -it zingg/zingg:0.4.1-SNAPSHOT bash ``` To know more about Docker, please refer to the official [docker documentation](https://docs.docker.com/). diff --git a/docs/stepbystep/installation/docker/file-read-write-permissions.md b/docs/stepbystep/installation/docker/file-read-write-permissions.md index e7ec5e8b3..a422f3b6a 100644 --- a/docs/stepbystep/installation/docker/file-read-write-permissions.md +++ b/docs/stepbystep/installation/docker/file-read-write-permissions.md @@ -9,5 +9,5 @@ A docker image is preferred to run with a non-root user. 
By default, the Zingg c ``` $ id uid=1000(abc) gid=1000(abc) groups=1000(abc) -$ docker run -u -it zingg/zingg:0.4.1 bash +$ docker run -u -it zingg/zingg:0.4.1-SNAPSHOT bash ``` diff --git a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md index f6e83648a..c81fd3c95 100644 --- a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md +++ b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md @@ -7,7 +7,7 @@ description: Using custom data to save data files on host machine However, note that once the docker container is stopped, all the work done in that session is lost. If we want to use custom data or persist the generated model or data files, we have to use **Volumes** or **Bind mount** to share files between the two. ``` -docker run -v : -it zingg/zingg:0.4.1 bash +docker run -v : -it zingg/zingg:0.4.1-SNAPSHOT bash ``` The **\** directory from host will get mounted inside container at **\**. Any file written inside this directory will persist on the host machine and can be reused in a new container instance later. diff --git a/docs/stepbystep/installation/installing-from-release/installing-zingg.md b/docs/stepbystep/installation/installing-from-release/installing-zingg.md index 0ba07e2e6..3308f6a47 100644 --- a/docs/stepbystep/installation/installing-from-release/installing-zingg.md +++ b/docs/stepbystep/installation/installing-from-release/installing-zingg.md @@ -6,13 +6,13 @@ description: Downloading and setting things up Download the tar zingg-version.tar.gz from the [Zingg releases page](https://github.com/zinggAI/zingg/releases) to a folder of your choice and run the following: -> gzip -d zingg-0.4.1.tar.gz ; tar xvf zingg-0.4.1.tar +> gzip -d zingg-0.4.1-SNAPSHOT.tar.gz ; tar xvf zingg-0.4.1-SNAPSHOT.tar -This will create a folder zingg-0.4.1 under the chosen folder. +This will create a folder zingg-0.4.1-SNAPSHOT under the chosen folder. Move the above folder to zingg. 
-> mv zingg-0.4.1 \~/zingg +> mv zingg-0.4.1-SNAPSHOT \~/zingg > export ZINGG\_HOME=path to zingg diff --git a/pom.xml b/pom.xml index cd60bffb5..955a17ffd 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,7 @@ - 0.4.1 + 0.4.1-SNAPSHOT false false 8 diff --git a/python/docs/conf.py b/python/docs/conf.py index d96167f0c..a4d14e2c8 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -22,7 +22,7 @@ author = 'Zingg.AI' # The full version, including alpha/beta/rc tags -release = '0.4.1' +release = '0.4.1-SNAPSHOT' # -- General configuration --------------------------------------------------- diff --git a/python/zingg/client.py b/python/zingg/client.py index 7d613931a..7f4eee2f2 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -20,7 +20,7 @@ _spark_ctxt = None _sqlContext = None _spark = None -_zingg_jar = 'zingg-0.4.1.jar' +_zingg_jar = 'zingg-0.4.1-SNAPSHOT.jar' def initSparkClient(): global _spark_ctxt diff --git a/python/zingg/databricks.py b/python/zingg/databricks.py index 8d7eb47fa..54404f989 100644 --- a/python/zingg/databricks.py +++ b/python/zingg/databricks.py @@ -37,7 +37,7 @@ } }, { - "jar": "dbfs:/FileStore/jars/zingg_0_4_0.jar" + "jar": "dbfs:/FileStore/jars/zingg_0_4_1_SNAPSHOT.jar" } ], "timeout_seconds": 0, diff --git a/scripts/zingg.sh b/scripts/zingg.sh index 6932c7a7a..2ac64f852 100755 --- a/scripts/zingg.sh +++ b/scripts/zingg.sh @@ -1,6 +1,6 @@ #!/bin/bash #ZINGG_HOME=./assembly/target -ZINGG_JARS=$ZINGG_HOME/zingg-0.4.1.jar +ZINGG_JARS=$ZINGG_HOME/zingg-0.4.1-SNAPSHOT.jar EMAIL=zingg@zingg.ai LICENSE=zinggLicense.txt log4j_setting="-Dlog4j2.configurationFile=file:log4j2.properties" diff --git a/test/run_tests.sh b/test/run_tests.sh index 7fc1397d6..0bd396f5b 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -1,8 +1,8 @@ #!/bin/bash # Set the paths to your JAR files and Spark binaries -SPARK_HOME="/opt/spark-3.2.4-bin-hadoop3.2" -PY4J_JAR="../common/client/target/zingg-common-client-0.4.1.jar" +SPARK_HOME="/opt/spark-3.5.0-bin-hadoop3" +PY4J_JAR="../common/client/target/zingg-common-client-0.4.1-SNAPSHOT.jar" # Run Spark with the required JAR files and your test script $SPARK_HOME/bin/spark-submit --jars $PY4J_JAR testInfra.py From 2a218628decae21c356734a4393f5820161896df Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Tue, 2 Jan 2024 13:05:13 +0530 Subject: [PATCH 038/219] review comments for code coverage --- .../zingg/common/core/model/TestModel.java | 50 ++----------------- .../java/zingg/hash/TestIdentityLong.java | 2 +- .../zingg/hash/TestTrimLastDigitsFloat.java | 30 +++++++++++ 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/common/core/src/test/java/zingg/common/core/model/TestModel.java b/common/core/src/test/java/zingg/common/core/model/TestModel.java index ebb3122d4..4396897a5 100644 --- a/common/core/src/test/java/zingg/common/core/model/TestModel.java +++ b/common/core/src/test/java/zingg/common/core/model/TestModel.java @@ -1,66 +1,22 @@ package zingg.common.core.model; -import zingg.common.client.ZFrame; - import static org.junit.jupiter.api.Assertions.assertArrayEquals; import org.junit.jupiter.api.Test; -import zingg.common.core.model.Model; - -import java.io.IOException; - public class TestModel { + @Test public void testGetGrid() { - Model model = getInstance(); - double[] result = model.getGrid(1.0, 10.0, 2.0, false); + double[] result = Model.getGrid(1.0, 10.0, 2.0, false); double[] expected = {1.0, 3.0, 5.0, 7.0, 9.0}; assertArrayEquals(expected, result, 0.0); } @Test public void 
testGetGridForMultiples() { - Model model = getInstance(); - double[] result = model.getGrid(1.0, 10.0, 2.0, true); + double[] result = Model.getGrid(1.0, 10.0, 2.0, true); double[] expected = {1.0, 2.0, 4.0, 8.0}; assertArrayEquals(expected, result, 0.0); } - - private Model getInstance() { - Model model = new Model() { - @Override - public void register(Object spark) { - } - - @Override - public void fit(ZFrame pos, ZFrame neg) { - } - - @Override - public void load(String path) { - } - - @Override - public ZFrame predict(ZFrame data) { - return null; - } - - @Override - public ZFrame predict(ZFrame data, boolean isDrop) { - return null; - } - - @Override - public void save(String path) throws IOException { - } - - @Override - public ZFrame transform(ZFrame input) { - return null; - } - }; - return model; - } - } diff --git a/common/core/src/test/java/zingg/hash/TestIdentityLong.java b/common/core/src/test/java/zingg/hash/TestIdentityLong.java index dab7404a9..03615df9a 100644 --- a/common/core/src/test/java/zingg/hash/TestIdentityLong.java +++ b/common/core/src/test/java/zingg/hash/TestIdentityLong.java @@ -15,7 +15,7 @@ public void testIdentityLong() { } @Test - public void testIdentityLong1() { + public void testNullValue() { IdentityLong value = getInstance(); assertEquals(null, value.call(null)); } diff --git a/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java index 3f9e9ef26..2676dfde4 100644 --- a/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java +++ b/common/core/src/test/java/zingg/hash/TestTrimLastDigitsFloat.java @@ -26,6 +26,24 @@ public void testTrimLast3DigitsFloat() { assertEquals(543f, value.call(543534.677f)); } + @Test + public void testTrimLast1DigitNegativeFloat() { + TrimLastDigitsFloat value = getInstance(1); + assertEquals(-54354f, value.call(-543534.677f)); + } + + @Test + public void testTrimLast2DigitsNegativeFloat() { + TrimLastDigitsFloat value = getInstance(2); + assertEquals(-5436f, value.call(-543534.677f)); + } + + @Test + public void testTrimLast3DigitsNegativeFloat() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(-544f, value.call(-543534.677f)); + } + @Test public void testTrimLast3DigitsFloatNaNValue() { TrimLastDigitsFloat value = getInstance(3); @@ -38,6 +56,18 @@ public void testTrimLast3DigitsFloatNullValue() { assertEquals(null, value.call(null)); } + @Test + public void testTrimLast3DigitsNegativeFloatNaNValue() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(Float.NaN, value.call(Float.NaN)); + } + + @Test + public void testTrimLast3DigitsNegativeFloatNullValue() { + TrimLastDigitsFloat value = getInstance(3); + assertEquals(null, value.call(null)); + } + private TrimLastDigitsFloat getInstance(int num) { return new TrimLastDigitsFloat(num); } From feabdd1008d634235eb77267566b48e3f672f4ba Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 3 Jan 2024 15:59:49 +0530 Subject: [PATCH 039/219] fixed zinggOption merge issue --- .../src/main/java/zingg/common/core/executor/ZinggBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index cc3706c75..222c620a8 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -98,7 +98,7 @@ public void postMetrics() { 
//Analytics.trackEnvProp(Metric.USER_HOME, collectMetrics); Analytics.trackDomain(Metric.DOMAIN, collectMetrics); Analytics.track(Metric.ZINGG_VERSION, "0.4.1-SNAPSHOT", collectMetrics); - Analytics.postEvent(zinggOptions.getValue(), collectMetrics); + Analytics.postEvent(zinggOption.getName(), collectMetrics); } public IArguments getArgs() { From b3e78d066e15b0c4833298b307ef765756d3208c Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 9 Jan 2024 16:26:43 +0530 Subject: [PATCH 040/219] refactor : ZinggSparkContext can be sent via constructor --- .../zingg/spark/core/executor/SparkDocumenter.java | 8 ++++++-- .../spark/core/executor/SparkFindAndLabeller.java | 7 +++++-- .../spark/core/executor/SparkLabelUpdater.java | 7 +++++-- .../java/zingg/spark/core/executor/SparkLinker.java | 13 ++++++++----- .../zingg/spark/core/executor/SparkRecommender.java | 8 ++++++-- .../spark/core/executor/SparkTrainMatcher.java | 6 +++++- 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 875d8c391..c38c7e801 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -27,10 +27,14 @@ public class SparkDocumenter extends Documenter, Row, public static final Log LOG = LogFactory.getLog(SparkDocumenter.class); public SparkDocumenter() { - setZinggOption(ZinggOptions.GENERATE_DOCS); - setContext(new ZinggSparkContext()); + this(new ZinggSparkContext()); } + public SparkDocumenter(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.GENERATE_DOCS); + setContext(sparkContext); + } + @Override public void init(IArguments args) throws ZinggClientException { super.init(args); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index 661be95d1..01f377348 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -24,8 +24,11 @@ public class SparkFindAndLabeller extends FindAndLabeller, public static final Log LOG = LogFactory.getLog(SparkLabelUpdater.class); public SparkLabelUpdater() { - setZinggOption(ZinggOptions.UPDATE_LABEL); - setContext(new ZinggSparkContext()); + this(new ZinggSparkContext()); } + public SparkLabelUpdater(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.UPDATE_LABEL); + setContext(sparkContext); + } @Override public void init(IArguments args) throws ZinggClientException { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 9aa7db0b0..43e14423a 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -5,18 +5,17 @@ import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; import zingg.common.client.IArguments; import zingg.common.client.ZinggClientException; import zingg.common.client.options.ZinggOptions; - import zingg.common.core.executor.Linker; import zingg.common.core.model.Model; import 
zingg.common.core.preprocess.StopWordsRemover; -import org.apache.spark.sql.SparkSession; -import zingg.spark.core.preprocess.SparkStopWordsRemover; import zingg.spark.core.context.ZinggSparkContext; +import zingg.spark.core.preprocess.SparkStopWordsRemover; public class SparkLinker extends Linker, Row, Column,DataType> { @@ -26,10 +25,14 @@ public class SparkLinker extends Linker, Row, Column, public static final Log LOG = LogFactory.getLog(SparkLinker.class); public SparkLinker() { - setZinggOption(ZinggOptions.LINK); - setContext(new ZinggSparkContext()); + this(new ZinggSparkContext()); } + public SparkLinker(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.LINK); + setContext(sparkContext); + } + @Override public void init(IArguments args) throws ZinggClientException { super.init(args); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 70c2cf69a..0dc03bd9e 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -29,10 +29,14 @@ public class SparkRecommender extends Recommender, Ro public static final Log LOG = LogFactory.getLog(SparkRecommender.class); public SparkRecommender() { - setZinggOption(ZinggOptions.RECOMMEND); - setContext(new ZinggSparkContext()); + this(new ZinggSparkContext()); } + public SparkRecommender(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.RECOMMEND); + setContext(sparkContext); + } + @Override public void init(IArguments args) throws ZinggClientException { super.init(args); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index 7efb6ddc1..a2772a124 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -22,8 +22,12 @@ public class SparkTrainMatcher extends TrainMatcher, public static final Log LOG = LogFactory.getLog(SparkTrainMatcher.class); public SparkTrainMatcher() { + this(new ZinggSparkContext()); + } + + + public SparkTrainMatcher(ZinggSparkContext sparkContext) { setZinggOption(ZinggOptions.TRAIN_MATCH); - ZinggSparkContext sparkContext = new ZinggSparkContext(); setContext(sparkContext); trainer = new SparkTrainer(sparkContext); matcher = new SparkMatcher(sparkContext); From e3a0cc4f3a2493dc0e1f7277f8707bea7d49f1c7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 9 Jan 2024 16:46:25 +0530 Subject: [PATCH 041/219] making methods public to be accessible by sub classes --- .../main/java/zingg/spark/core/executor/SparkDocumenter.java | 4 ++-- .../java/zingg/spark/core/executor/SparkLabelUpdater.java | 2 +- .../src/main/java/zingg/spark/core/executor/SparkTrainer.java | 2 +- .../zingg/spark/core/executor/SparkTrainingDataFinder.java | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index c38c7e801..5de555464 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -42,13 +42,13 @@ public void init(IArguments args) throws ZinggClientException { } @Override - protected ModelDocumenter, Row, Column, 
DataType> getModelDocumenter() { + public ModelDocumenter, Row, Column, DataType> getModelDocumenter() { return new SparkModelDocumenter(getContext(),getArgs()); } @Override - protected DataDocumenter, Row, Column, DataType> getDataDocumenter() { + public DataDocumenter, Row, Column, DataType> getDataDocumenter() { return new SparkDataDocumenter(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index 849d7cfd4..bf24ea9a4 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -44,7 +44,7 @@ public void init(IArguments args) throws ZinggClientException { getContext().init(); } - protected Pipe setSaveModeOnPipe(Pipe p) { + public Pipe setSaveModeOnPipe(Pipe p) { p.setMode(SaveMode.Overwrite.toString()); return p; } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 371b7e6d4..6d93abc2b 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -41,7 +41,7 @@ public void init(IArguments args) throws ZinggClientException { } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + public StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index 8aea1b39b..bffac326c 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -38,7 +38,7 @@ public void init(IArguments args) throws ZinggClientException { } @Override - protected StopWordsRemover, Row, Column, DataType> getStopWords() { + public StopWordsRemover, Row, Column, DataType> getStopWords() { return new SparkStopWordsRemover(getContext(),getArgs()); } From 98b75dc1db614b946d34516de60f09c17fddc9f6 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Tue, 16 Jan 2024 18:33:26 +0530 Subject: [PATCH 042/219] Update Dependencies version --- assembly/dependency-reduced-pom.xml | 2 +- assembly/pom.xml | 2 +- common/core/pom.xml | 2 +- spark/client/pom.xml | 6 +++--- spark/core/pom.xml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/assembly/dependency-reduced-pom.xml b/assembly/dependency-reduced-pom.xml index 2e7dabf83..041aa6573 100644 --- a/assembly/dependency-reduced-pom.xml +++ b/assembly/dependency-reduced-pom.xml @@ -45,7 +45,7 @@ maven-assembly-plugin - 2.4.1 + 3.6.0 make-assembly diff --git a/assembly/pom.xml b/assembly/pom.xml index b1282eda8..7027c77a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -112,7 +112,7 @@ maven-assembly-plugin - 2.4.1 + 3.6.0 ${project.basedir}/src/assembly/dist.xml diff --git a/common/core/pom.xml b/common/core/pom.xml index 926187b45..1f03d694c 100644 --- a/common/core/pom.xml +++ b/common/core/pom.xml @@ -27,7 +27,7 @@ org.apache.httpcomponents httpclient - 4.5.2 + 4.5.14 diff --git a/spark/client/pom.xml b/spark/client/pom.xml index 418abd6e8..0515ed51b 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -8,8 +8,8 @@ 
zingg-spark-client jar - 2.12.6 - 2.12.6.1 + 2.15.2 + 2.15.2 @@ -46,7 +46,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.9.1 + 3.5.0 ${basedir}/src/main/java/zingg/client diff --git a/spark/core/pom.xml b/spark/core/pom.xml index 3034dd70b..3674129b2 100644 --- a/spark/core/pom.xml +++ b/spark/core/pom.xml @@ -32,7 +32,7 @@ net.alchim31.maven scala-maven-plugin - 3.2.2 + 4.8.0 scala-compile-first From fa65b3e6a091cf2b8c6c73243d3dcbcb7acfad74 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 24 Jan 2024 13:50:40 +0530 Subject: [PATCH 043/219] Annotations to Generate py codes --- common/client/pom.xml | 28 ++++++ .../java/zingg/common/client/pipe/Pipe.java | 9 +- common/pom.xml | 1 + common/py/pom.xml | 10 ++ .../common/py/annotations/PythonClass.java | 9 ++ .../common/py/annotations/PythonMethod.java | 9 ++ .../py/processors/PythonClassProcessor.java | 91 +++++++++++++++++++ .../py/processors/PythonMethodProcessor.java | 76 ++++++++++++++++ 8 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 common/py/pom.xml create mode 100644 common/py/src/main/java/zingg/common/py/annotations/PythonClass.java create mode 100644 common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java create mode 100644 common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java create mode 100644 common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java diff --git a/common/client/pom.xml b/common/client/pom.xml index c67339949..d8e3bafd3 100644 --- a/common/client/pom.xml +++ b/common/client/pom.xml @@ -8,10 +8,38 @@ zingg-common-client jar + + zingg + zingg-common-py + ${zingg.version} + javax.mail mail 1.4 + + + + + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${maven.compiler.source} + ${maven.compiler.source} + true + + + + zingg.common.py.processors.PythonClassProcessor + + + zingg.common.py.processors.PythonMethodProcessor + + + + + + diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index 7a4f8ff88..99e80c9cc 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -11,13 +11,16 @@ import zingg.common.client.ZFrame; import zingg.common.client.util.StringRedactor; +import zingg.common.py.annotations.PythonClass; +import zingg.common.py.annotations.PythonMethod; + /**Actual pipe def in the args. 
One pipe can be used at multiple places with different tables, locations, queries etc * * @author sgoyal * */ - +@PythonClass @JsonInclude(Include.NON_NULL) public class Pipe implements Serializable{ // St:StructType, Sv:SaveMode @@ -57,12 +60,12 @@ public void setSchema(String schema) { this.schema = schema; } - + @PythonMethod public String getName() { return name; } - + @PythonMethod @JsonValue public void setName(String name) { this.name = name; diff --git a/common/pom.xml b/common/pom.xml index 23dd19064..c50c2b037 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -12,5 +12,6 @@ infra core client + py diff --git a/common/py/pom.xml b/common/py/pom.xml new file mode 100644 index 000000000..bde63932b --- /dev/null +++ b/common/py/pom.xml @@ -0,0 +1,10 @@ + + 4.0.0 + + zingg + zingg-common + ${zingg.version} + + zingg-common-py + \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java new file mode 100644 index 000000000..0d3bf21a5 --- /dev/null +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java @@ -0,0 +1,9 @@ +package zingg.common.py.annotations; + +import javax.annotation.processing.*; + +import java.lang.annotation.Target; +import java.lang.annotation.ElementType; + +@Target({ElementType.TYPE}) +public @interface PythonClass {} \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java new file mode 100644 index 000000000..f59a9c038 --- /dev/null +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java @@ -0,0 +1,9 @@ +package zingg.common.py.annotations; + +import javax.annotation.processing.*; + +import java.lang.annotation.Target; +import java.lang.annotation.ElementType; + +@Target({ElementType.METHOD}) +public @interface PythonMethod {} \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java new file mode 100644 index 000000000..1090628a2 --- /dev/null +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -0,0 +1,91 @@ +package zingg.common.py.processors; + +import java.util.List; +import javax.annotation.processing.*; +import javax.lang.model.type.TypeMirror; +import javax.lang.model.type.TypeKind; +import java.util.Set; +import javax.lang.model.element.*; +import javax.lang.model.util.ElementFilter; + +import zingg.common.py.annotations.*; + +@SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") +public class PythonClassProcessor extends AbstractProcessor { + + private boolean importsAndDeclarationsGenerated = false; + + @Override + public boolean process(Set annotations, RoundEnvironment roundEnv) { + + // Imports and global declarations + if (!importsAndDeclarationsGenerated) { + generateImportsAndDeclarations(); + importsAndDeclarationsGenerated = true; + } + + + // process Services annotation + for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { + if (element.getKind() == ElementKind.CLASS) { + TypeElement classElement = (TypeElement) element; + PackageElement packageElement = + (PackageElement) classElement.getEnclosingElement(); + System.out.println("class " + element.getSimpleName() + ":"); + + // __init__ method + System.out.println(" def __init__(self" + + 
generateConstructorParameters(classElement) + "):"); + if (element.getSimpleName().contentEquals("EPipe")) { + generateClassInitializationCode(classElement); + } + for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + if (!field.getSimpleName().contentEquals("serialVersionUID")) { + generateFieldInitializationCode(field); + } + } + } + System.out.println(); + // rest of generated class contents + } + + return false; + + } + + private void generateImportsAndDeclarations() { + System.out.println("import logging"); + System.out.println("from zingg.client import *"); + System.out.println("LOG = logging.getLogger(\"zingg.pipes\")"); + System.out.println(); + System.out.println("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe"); + System.out.println("FilePipe = getJVM().zingg.common.client.pipe.FilePipe"); + System.out.println("JStructType = getJVM().org.apache.spark.sql.types.StructType"); + System.out.println(); + } + + private void generateClassInitializationCode(TypeElement classElement) { + System.out.println(" self.EPipe = getJVM().zingg.spark.client.pipe.SparkPipe()"); + } + + // private void generateFieldInitializationCode(VariableElement field, ExecutableElement methodElement, TypeElement classElement) { + private void generateFieldInitializationCode(VariableElement field) { + System.out.println(" self.EPipe." + field.getSimpleName() + " = " + field.getSimpleName()); + // String fieldName = field.getSimpleName().toString(); + // String methodName = methodElement.getSimpleName().toString(); + // System.out.println(" self." + fieldName + " = " + "getJVM()." + + // classElement.getQualifiedName().toString() + "." + methodName + "(" + fieldName + ")"); + } + + private String generateConstructorParameters(TypeElement classElement) { + StringBuilder parameters = new StringBuilder(); + for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + if (!field.getSimpleName().contentEquals("serialVersionUID")) { + parameters.append(", "); + parameters.append(field.getSimpleName()); + } + } + return parameters.toString(); + } + +} diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java new file mode 100644 index 000000000..fe0b02747 --- /dev/null +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -0,0 +1,76 @@ +package zingg.common.py.processors; + +import java.util.List; +import javax.annotation.processing.*; +import javax.lang.model.type.TypeMirror; +import javax.lang.model.type.TypeKind; +import java.util.Set; +import javax.lang.model.element.*; +import javax.lang.model.util.ElementFilter; + +import zingg.common.py.annotations.*; + +@SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") +public class PythonMethodProcessor extends AbstractProcessor { + + private boolean importsAndDeclarationsGenerated = false; + + @Override + public boolean process(Set annotations, RoundEnvironment roundEnv) { + + // process Services annotation + for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { + + if (element.getKind() == ElementKind.METHOD) { + ExecutableElement methodElement = (ExecutableElement) element; + System.out.println(" def " + methodElement.getSimpleName() + "(self" + + generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); + generateFieldAssignment(methodElement); + } + System.out.println(); + + // 
rest of generated class contents + } + return false; + } + + private String generateMethodSignature(ExecutableElement methodElement) { + StringBuilder signature = new StringBuilder(); + signature.append(generateMethodParameters(methodElement)); + return signature.toString(); + } + + private String generateMethodParameters(ExecutableElement methodElement) { + StringBuilder parameters = new StringBuilder(); + for (VariableElement parameter : methodElement.getParameters()) { + parameters.append(", "); + parameters.append(parameter.getSimpleName()); + } + return parameters.toString(); + } + + private String generateMethodReturn(ExecutableElement methodElement) { + TypeMirror returnType = methodElement.getReturnType(); + if (returnType.getKind() == TypeKind.VOID) { + return ""; + } else { + String returnTypeString = resolveType(returnType); + String variableName = methodElement.getSimpleName().toString(); + return "return " + variableName; + } + } + + private String resolveType(TypeMirror typeMirror) { + return typeMirror.toString(); + } + + private void generateFieldAssignment(ExecutableElement methodElement) { + List parameters = methodElement.getParameters(); + if (!parameters.isEmpty()) { + VariableElement parameter = parameters.get(0); + String variableName = parameter.getSimpleName().toString(); + System.out.println(" self." + variableName + " = " + variableName); + } +} + +} From 2c7027d1f15b217f0d1abe65acf6d8f9496f9172 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 24 Jan 2024 13:54:54 +0530 Subject: [PATCH 044/219] Annotations to Generate py codes --- .../zingg/common/py/processors/PythonClassProcessor.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 1090628a2..89d29ef55 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -36,7 +36,7 @@ public boolean process(Set annotations, RoundEnvironment // __init__ method System.out.println(" def __init__(self" + generateConstructorParameters(classElement) + "):"); - if (element.getSimpleName().contentEquals("EPipe")) { + if (element.getSimpleName().contentEquals("pipe")) { generateClassInitializationCode(classElement); } for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { @@ -65,12 +65,12 @@ private void generateImportsAndDeclarations() { } private void generateClassInitializationCode(TypeElement classElement) { - System.out.println(" self.EPipe = getJVM().zingg.spark.client.pipe.SparkPipe()"); + System.out.println(" self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe()"); } // private void generateFieldInitializationCode(VariableElement field, ExecutableElement methodElement, TypeElement classElement) { private void generateFieldInitializationCode(VariableElement field) { - System.out.println(" self.EPipe." + field.getSimpleName() + " = " + field.getSimpleName()); + System.out.println(" self.pipe." + field.getSimpleName() + " = " + field.getSimpleName()); // String fieldName = field.getSimpleName().toString(); // String methodName = methodElement.getSimpleName().toString(); // System.out.println(" self." + fieldName + " = " + "getJVM()." 
+ From 17baef2ea8c9616bb0eeea510574ef2c30c25080 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Thu, 25 Jan 2024 17:55:54 +0530 Subject: [PATCH 045/219] updated the setter functions in annotations --- .../java/zingg/common/client/pipe/Pipe.java | 6 +- .../py/processors/ProcessorContext.java | 22 ++++++ .../py/processors/PythonClassProcessor.java | 79 +++++++++++++------ .../py/processors/PythonMethodProcessor.java | 56 +++++++++---- 4 files changed, 124 insertions(+), 39 deletions(-) create mode 100644 common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index 99e80c9cc..aab0878b1 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -55,7 +55,7 @@ public String getSchema() { return schema; } - + @PythonMethod public void setSchema(String schema) { this.schema = schema; } @@ -71,10 +71,12 @@ public void setName(String name) { this.name = name; } + @PythonMethod public String getFormat() { return format; } + @PythonMethod @JsonValue public void setFormat(String sinkType) { this.format = sinkType; @@ -90,6 +92,7 @@ public Map getProps() { return props; } + @PythonMethod public void setProp(String k, String v) { if (props == null) props = new HashMap(); this.props.put(k, v); @@ -134,6 +137,7 @@ public void setDataset(ZFrame ds){ this.dataset = ds; } + @PythonMethod @Override public String toString() { StringRedactor redactor = new StringRedactor(); diff --git a/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java b/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java new file mode 100644 index 000000000..6131d51ed --- /dev/null +++ b/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java @@ -0,0 +1,22 @@ +package zingg.common.py.processors; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class ProcessorContext { + private static final ProcessorContext INSTANCE = new ProcessorContext(); + + private Map> classMethodsMap = new HashMap<>(); + + private ProcessorContext() { + } + + public static ProcessorContext getInstance() { + return INSTANCE; + } + + public Map> getClassMethodsMap() { + return classMethodsMap; + } +} diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 89d29ef55..e9cd37454 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -1,10 +1,14 @@ package zingg.common.py.processors; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; + import javax.annotation.processing.*; -import javax.lang.model.type.TypeMirror; -import javax.lang.model.type.TypeKind; import java.util.Set; +import java.util.stream.Collectors; + import javax.lang.model.element.*; import javax.lang.model.util.ElementFilter; @@ -14,6 +18,12 @@ public class PythonClassProcessor extends AbstractProcessor { private boolean importsAndDeclarationsGenerated = false; + private Map> classMethodsMap = new HashMap<>(); + + @Override + public synchronized void init(ProcessingEnvironment processingEnv) { + super.init(processingEnv); + } @Override public boolean process(Set annotations, 
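+ // Note: ProcessorContext above is a process-wide singleton. Annotation processors run inside a single
+ // javac invocation, so the class-to-methods map filled by this processor is later readable by
+ // PythonMethodProcessor as well; this appears to be how the two processors coordinate.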
RoundEnvironment roundEnv) { @@ -29,28 +39,40 @@ public boolean process(Set annotations, RoundEnvironment for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { if (element.getKind() == ElementKind.CLASS) { TypeElement classElement = (TypeElement) element; - PackageElement packageElement = - (PackageElement) classElement.getEnclosingElement(); + PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); + List methodNames = new ArrayList<>(); + System.out.println("class " + element.getSimpleName() + ":"); // __init__ method System.out.println(" def __init__(self" + generateConstructorParameters(classElement) + "):"); - if (element.getSimpleName().contentEquals("pipe")) { - generateClassInitializationCode(classElement); + if (element.getSimpleName().contentEquals("Pipe")) { + generateClassInitializationCode(classElement, element); } for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { if (!field.getSimpleName().contentEquals("serialVersionUID")) { - generateFieldInitializationCode(field); + generateFieldInitializationCode(field, element); + } + } + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { + if (methodElement.getAnnotation(PythonMethod.class) != null) { + methodNames.add(methodElement.getSimpleName().toString()); } } + classMethodsMap.put(element.getSimpleName().toString(), methodNames); } System.out.println(); - // rest of generated class contents + // rest of generated class contents } + ProcessorContext processorContext = ProcessorContext.getInstance(); + processorContext.getClassMethodsMap().putAll(classMethodsMap); return false; + } + Map> getClassMethodsMap() { + return classMethodsMap; } private void generateImportsAndDeclarations() { @@ -64,28 +86,41 @@ private void generateImportsAndDeclarations() { System.out.println(); } - private void generateClassInitializationCode(TypeElement classElement) { - System.out.println(" self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe()"); + private void generateClassInitializationCode(TypeElement classElement, Element element) { + System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); } - // private void generateFieldInitializationCode(VariableElement field, ExecutableElement methodElement, TypeElement classElement) { - private void generateFieldInitializationCode(VariableElement field) { - System.out.println(" self.pipe." + field.getSimpleName() + " = " + field.getSimpleName()); - // String fieldName = field.getSimpleName().toString(); - // String methodName = methodElement.getSimpleName().toString(); - // System.out.println(" self." + fieldName + " = " + "getJVM()." + - // classElement.getQualifiedName().toString() + "." + methodName + "(" + fieldName + ")"); + private void generateFieldInitializationCode(VariableElement field, Element element) { + String fieldName = field.getSimpleName().toString(); + String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." 
+ fieldName + " = " + fieldName; + + if (!fieldName.startsWith("FORMAT_")) { + System.out.println(" " + fieldAssignment); + } } private String generateConstructorParameters(TypeElement classElement) { StringBuilder parameters = new StringBuilder(); - for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - if (!field.getSimpleName().contentEquals("serialVersionUID")) { - parameters.append(", "); - parameters.append(field.getSimpleName()); - } + List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); + + fields = fields.stream() + .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) + .filter(this::isFieldForConstructor) + .collect(Collectors.toList()); + + for (VariableElement field : fields) { + parameters.append(", "); + parameters.append(field.getSimpleName()); } return parameters.toString(); } + + private boolean isFieldForConstructor(VariableElement field) { + String fieldName = field.getSimpleName().toString(); + + return !fieldName.equals(fieldName.toUpperCase()) + && !field.getModifiers().contains(Modifier.STATIC) + && !fieldName.startsWith("FORMAT_"); + } } diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index fe0b02747..5a499795f 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,36 +1,51 @@ package zingg.common.py.processors; import java.util.List; +import java.util.Map; + import javax.annotation.processing.*; import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; -import javax.lang.model.element.*; -import javax.lang.model.util.ElementFilter; +import java.util.logging.Logger; +import javax.lang.model.element.*; import zingg.common.py.annotations.*; @SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") public class PythonMethodProcessor extends AbstractProcessor { - private boolean importsAndDeclarationsGenerated = false; - + private Map> classMethodsMap; + // private static final Logger LOG = Logger.getLogger(PythonMethodProcessor.class.getName()); + @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { + ProcessorContext processorContext = ProcessorContext.getInstance(); + classMethodsMap = processorContext.getClassMethodsMap(); + // LOG.info("Processing PythonMethod annotations..."); + // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { if (element.getKind() == ElementKind.METHOD) { ExecutableElement methodElement = (ExecutableElement) element; - System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); - generateFieldAssignment(methodElement); - } - System.out.println(); + String className = methodElement.getEnclosingElement().getSimpleName().toString(); - // rest of generated class contents + if (classMethodsMap.containsKey(className)) { + List methodNames = classMethodsMap.get(className); + + if (methodNames.contains(methodElement.getSimpleName().toString())) { + // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); + System.out.println(" def " + methodElement.getSimpleName() + "(self" + + generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); + 
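+ // Sketch of the intended emission for a setter such as Pipe.setName(String)
+ // (compare the generated PipeGenerated.py added in a later commit):
+ //   def setName(self, name):
+ //       self.pipe.setName(name)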
generateFieldAssignment(methodElement); + } + } + } + System.out.println(); } + // LOG.info("Processing complete."); return false; } @@ -65,12 +80,21 @@ private String resolveType(TypeMirror typeMirror) { } private void generateFieldAssignment(ExecutableElement methodElement) { - List parameters = methodElement.getParameters(); - if (!parameters.isEmpty()) { - VariableElement parameter = parameters.get(0); - String variableName = parameter.getSimpleName().toString(); - System.out.println(" self." + variableName + " = " + variableName); + List parameters = methodElement.getParameters(); + + if (!parameters.isEmpty()) { + String methodName = methodElement.getSimpleName().toString(); + String className = methodElement.getEnclosingElement().getSimpleName().toString(); + + StringBuilder parameterList = new StringBuilder(); + for (VariableElement parameter : parameters) { + if (parameterList.length() > 0) { + parameterList.append(", "); + } + parameterList.append(parameter.getSimpleName()); + } + System.out.println(" self." + className.toLowerCase() + "." + methodName + "(" + parameterList + ")"); + } } -} } From 62e2933baba7f10cfdad1639313fcb18ea35607c Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Fri, 26 Jan 2024 18:19:11 +0530 Subject: [PATCH 046/219] Update Annotations to generate code --- .../py/processors/PythonClassProcessor.java | 34 ++++++++++--------- .../py/processors/PythonMethodProcessor.java | 14 ++++---- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index e9cd37454..d0d51c9c7 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -47,14 +47,14 @@ public boolean process(Set annotations, RoundEnvironment // __init__ method System.out.println(" def __init__(self" + generateConstructorParameters(classElement) + "):"); - if (element.getSimpleName().contentEquals("Pipe")) { - generateClassInitializationCode(classElement, element); - } - for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - if (!field.getSimpleName().contentEquals("serialVersionUID")) { - generateFieldInitializationCode(field, element); - } - } + generateClassInitializationCode(classElement, element); + + // for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + // if (!field.getSimpleName().contentEquals("serialVersionUID")) { + // generateFieldInitializationCode(field, element); + // } + // } + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { if (methodElement.getAnnotation(PythonMethod.class) != null) { methodNames.add(methodElement.getSimpleName().toString()); @@ -87,17 +87,19 @@ private void generateImportsAndDeclarations() { } private void generateClassInitializationCode(TypeElement classElement, Element element) { - System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); + if (element.getSimpleName().contentEquals("Pipe")) { + System.out.println(" self." 
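+ // For the Pipe class this emits the binding of the Python wrapper to the underlying Java object, i.e.:
+ //   self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe()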
+ element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); + } } - private void generateFieldInitializationCode(VariableElement field, Element element) { - String fieldName = field.getSimpleName().toString(); - String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." + fieldName + " = " + fieldName; + // private void generateFieldInitializationCode(VariableElement field, Element element) { + // String fieldName = field.getSimpleName().toString(); + // String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." + fieldName + " = " + fieldName; - if (!fieldName.startsWith("FORMAT_")) { - System.out.println(" " + fieldAssignment); - } - } + // if (!fieldName.startsWith("FORMAT_")) { + // System.out.println(" " + fieldAssignment); + // } + // } private String generateConstructorParameters(TypeElement classElement) { StringBuilder parameters = new StringBuilder(); diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 5a499795f..183b6458d 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -7,7 +7,7 @@ import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; -import java.util.logging.Logger; +// import java.util.logging.Logger; import javax.lang.model.element.*; import zingg.common.py.annotations.*; @@ -38,7 +38,8 @@ public boolean process(Set annotations, RoundEnvironment if (methodNames.contains(methodElement.getSimpleName().toString())) { // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); + generateMethodSignature(methodElement) + "):"); + generateMethodReturn(methodElement); generateFieldAssignment(methodElement); } } @@ -64,14 +65,15 @@ private String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - private String generateMethodReturn(ExecutableElement methodElement) { + private void generateMethodReturn(ExecutableElement methodElement) { TypeMirror returnType = methodElement.getReturnType(); if (returnType.getKind() == TypeKind.VOID) { - return ""; + return; } else { String returnTypeString = resolveType(returnType); - String variableName = methodElement.getSimpleName().toString(); - return "return " + variableName; + String methodName = methodElement.getSimpleName().toString(); + String className = methodElement.getEnclosingElement().getSimpleName().toString(); + System.out.println(" return self." + className.toLowerCase() + "." 
+ methodName + "()"); } } From 3722be3f79d010e947b3e3fe982dea413b07d4bb Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Thu, 1 Feb 2024 15:56:47 +0530 Subject: [PATCH 047/219] Generated python apis using annotations --- .../java/zingg/common/client/Arguments.java | 22 +- .../zingg/common/client/FieldDefinition.java | 10 +- .../py/processors/PythonClassProcessor.java | 115 ++-- .../py/processors/PythonMethodProcessor.java | 30 +- examples/febrl/GeneratedFebrlExample.py | 46 ++ python/zingg/ArgumentsGenerated.py | 44 ++ python/zingg/FieldDefinitionGenerated.py | 24 + python/zingg/PipeGenerated.py | 35 ++ python/zingg/otherThanGenerated.py | 510 ++++++++++++++++++ python/zingg/otherThanGeneratedArguments.py | 56 ++ .../otherThanGeneratedFieldDefinition.py | 20 + python/zingg/otherThanGeneratedPipe.py | 228 ++++++++ 12 files changed, 1072 insertions(+), 68 deletions(-) create mode 100644 examples/febrl/GeneratedFebrlExample.py create mode 100644 python/zingg/ArgumentsGenerated.py create mode 100644 python/zingg/FieldDefinitionGenerated.py create mode 100644 python/zingg/PipeGenerated.py create mode 100644 python/zingg/otherThanGenerated.py create mode 100644 python/zingg/otherThanGeneratedArguments.py create mode 100644 python/zingg/otherThanGeneratedFieldDefinition.py create mode 100644 python/zingg/otherThanGeneratedPipe.py diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index 3f396f090..5116a5cd9 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -17,6 +17,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import zingg.common.client.pipe.Pipe; +import zingg.common.py.annotations.PythonClass; +import zingg.common.py.annotations.PythonMethod; /** @@ -79,6 +81,7 @@ * } * */ +@PythonClass @JsonInclude(Include.NON_NULL) public class Arguments implements Serializable, IArguments { @@ -121,7 +124,7 @@ public Arguments() { public int getNumPartitions() { return numPartitions; } - + @PythonMethod @Override public void setNumPartitions(int numPartitions) throws ZinggClientException{ if (numPartitions != -1 && numPartitions <= 0) @@ -154,6 +157,7 @@ public float getLabelDataSampleSize() { * generating seed samples * @throws ZinggClientException */ + @PythonMethod @Override public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClientException { if (labelDataSampleSize > 1 || labelDataSampleSize < 0) @@ -235,12 +239,12 @@ public void setZinggInternal(Pipe[] zinggDir) { */ - + @PythonMethod @Override public String getModelId() { return modelId; } - + @PythonMethod @Override public void setModelId(String modelId) { this.modelId = modelId; @@ -263,6 +267,7 @@ public Pipe[] getOutput() { * where the match result is saved * @throws ZinggClientException */ + @PythonMethod @Override public void setOutput(Pipe[] outputDir) throws ZinggClientException { //checkNullBlankEmpty(outputDir, " path for saving results"); @@ -340,6 +345,7 @@ public String getZinggDir() { * @param zinggDir * path to the Zingg directory */ + @PythonMethod @Override public void setZinggDir(String zinggDir) { this.zinggDir = zinggDir; @@ -351,12 +357,13 @@ public void setZinggDir(String zinggDir) { * * @return the path for internal Zingg usage */ - + @PythonMethod @Override @JsonIgnore public String getZinggBaseModelDir(){ return zinggDir + "/" + modelId; } + @PythonMethod @Override @JsonIgnore public String 
getZinggModelDir() { @@ -386,6 +393,7 @@ public String getZinggDataDocFile() { * * @return the path for internal Zingg usage */ + @PythonMethod @Override @JsonIgnore public String getZinggBaseTrainingDataDir() { @@ -399,6 +407,7 @@ public String getZinggBaseTrainingDataDir() { * * @return the path for internal Zingg usage */ + @PythonMethod @Override @JsonIgnore public String getZinggTrainingDataUnmarkedDir() { @@ -410,6 +419,7 @@ public String getZinggTrainingDataUnmarkedDir() { * * @return the path for internal Zingg usage */ + @PythonMethod @Override @JsonIgnore public String getZinggTrainingDataMarkedDir() { @@ -478,7 +488,7 @@ public void setCollectMetrics(boolean collectMetrics) { public float getStopWordsCutoff() { return stopWordsCutoff; } - + @PythonMethod @Override public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException { if (stopWordsCutoff > 1 || stopWordsCutoff < 0) @@ -500,7 +510,7 @@ public void setShowConcise(boolean showConcise) { public String getColumn() { return column; } - + @PythonMethod @Override public void setColumn(String column) { this.column = column; diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 314c6d868..f850fce8a 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -22,6 +22,9 @@ import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import zingg.common.py.annotations.PythonClass; +import zingg.common.py.annotations.PythonMethod; + /** * This class defines each field that we use in matching We can use this to @@ -30,6 +33,7 @@ * @author sgoyal * */ +@PythonClass public class FieldDefinition implements Serializable { @@ -52,7 +56,7 @@ public FieldDefinition() { } public String getFields() { return fields; } - + @PythonMethod public void setFields(String fields) { this.fields = fields;} /** @@ -71,6 +75,7 @@ public List getMatchType() { * @param type * the type to set */ + @PythonMethod @JsonDeserialize(using = MatchTypeDeserializer.class) public void setMatchType(List type) { this.matchType = type; //MatchTypeDeserializer.getMatchTypeFromString(type); @@ -98,7 +103,7 @@ public void setDataType(String d) { public String getStopWords() { return stopWords; } - + @PythonMethod public void setStopWords(String stopWords) { this.stopWords = stopWords; } @@ -115,6 +120,7 @@ public String getFieldName() { return fieldName; } + @PythonMethod public void setFieldName(String fieldName) { this.fieldName = fieldName; } diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index d0d51c9c7..e15c0ffee 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -1,5 +1,8 @@ package zingg.common.py.processors; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -17,7 +20,6 @@ @SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") public class PythonClassProcessor extends AbstractProcessor { - private boolean importsAndDeclarationsGenerated = false; private Map> classMethodsMap = new HashMap<>(); @Override @@ -28,12 +30,6 
@@ public synchronized void init(ProcessingEnvironment processingEnv) { @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { - // Imports and global declarations - if (!importsAndDeclarationsGenerated) { - generateImportsAndDeclarations(); - importsAndDeclarationsGenerated = true; - } - // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { @@ -41,29 +37,26 @@ public boolean process(Set annotations, RoundEnvironment TypeElement classElement = (TypeElement) element; PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); List methodNames = new ArrayList<>(); + + try (FileWriter fileWriter = new FileWriter("python/zingg"+ File.separator + element.getSimpleName() + "Generated.py")) { + generateImportsAndDeclarations(element, fileWriter); - System.out.println("class " + element.getSimpleName() + ":"); + fileWriter.write("class " + element.getSimpleName() + ":\n"); - // __init__ method - System.out.println(" def __init__(self" + - generateConstructorParameters(classElement) + "):"); - generateClassInitializationCode(classElement, element); + // __init__ method + fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, element) + "):\n"); + generateClassInitializationCode(classElement, element, fileWriter); - // for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - // if (!field.getSimpleName().contentEquals("serialVersionUID")) { - // generateFieldInitializationCode(field, element); - // } - // } - - for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { - if (methodElement.getAnnotation(PythonMethod.class) != null) { - methodNames.add(methodElement.getSimpleName().toString()); + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { + if (methodElement.getAnnotation(PythonMethod.class) != null) { + methodNames.add(methodElement.getSimpleName().toString()); + } } + classMethodsMap.put(element.getSimpleName().toString(), methodNames); + } catch (IOException e) { + e.printStackTrace(); } - classMethodsMap.put(element.getSimpleName().toString(), methodNames); } - System.out.println(); - // rest of generated class contents } ProcessorContext processorContext = ProcessorContext.getInstance(); processorContext.getClassMethodsMap().putAll(classMethodsMap); @@ -75,21 +68,39 @@ Map> getClassMethodsMap() { return classMethodsMap; } - private void generateImportsAndDeclarations() { - System.out.println("import logging"); - System.out.println("from zingg.client import *"); - System.out.println("LOG = logging.getLogger(\"zingg.pipes\")"); - System.out.println(); - System.out.println("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe"); - System.out.println("FilePipe = getJVM().zingg.common.client.pipe.FilePipe"); - System.out.println("JStructType = getJVM().org.apache.spark.sql.types.StructType"); - System.out.println(); + private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException { + fileWriter.write("from zingg.otherThanGenerated import *\n"); + if (element.getSimpleName().contentEquals("Pipe")) { + fileWriter.write("import logging\n"); + fileWriter.write("LOG = logging.getLogger(\"zingg.pipes\")\n"); + fileWriter.write("\n"); + fileWriter.write("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe\n"); + fileWriter.write("FilePipe = getJVM().zingg.common.client.pipe.FilePipe\n"); + 
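+ // These writes produce the module prelude of the generated PipeGenerated.py
+ // (logging setup plus py4j handles such as JPipe, FilePipe and JStructType);
+ // the resulting file appears later in this patch.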
fileWriter.write("JStructType = getJVM().org.apache.spark.sql.types.StructType\n"); + fileWriter.write("\n"); + } } - private void generateClassInitializationCode(TypeElement classElement, Element element) { + private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException { if (element.getSimpleName().contentEquals("Pipe")) { - System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setName(name)\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFormat(format)\n"); + } + else if (element.getSimpleName().contentEquals("Arguments")) { + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n"); + } + else if (element.getSimpleName().contentEquals("FieldDefinition")) { + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.FieldDefinition()\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFieldName(name)\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setDataType(self.stringify(dataType))\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setMatchType(matchType)\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFields(name)\n"); + fileWriter.write("\n"); + fileWriter.write(" def getFieldDefinition(self):\n"); + fileWriter.write(" return self.fielddefinition\n"); } + fileWriter.write("\n"); } // private void generateFieldInitializationCode(VariableElement field, Element element) { @@ -101,18 +112,32 @@ private void generateClassInitializationCode(TypeElement classElement, Element e // } // } - private String generateConstructorParameters(TypeElement classElement) { - StringBuilder parameters = new StringBuilder(); - List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); + private String generateConstructorParameters(TypeElement classElement, Element element) { - fields = fields.stream() - .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) - .filter(this::isFieldForConstructor) - .collect(Collectors.toList()); + StringBuilder parameters = new StringBuilder(); - for (VariableElement field : fields) { - parameters.append(", "); - parameters.append(field.getSimpleName()); + if (element.getSimpleName().contentEquals("Arguments")) { + // For the "Arguments" class, no constructor parameters are needed + return ""; + } + else if (element.getSimpleName().contentEquals("Pipe")) { + parameters.append(", name, format"); + } + else if (element.getSimpleName().contentEquals("FieldDefinition")) { + parameters.append(", name, dataType, *matchType"); + } + else { + List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); + + fields = fields.stream() + .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) + .filter(this::isFieldForConstructor) + .collect(Collectors.toList()); + + for (VariableElement field : fields) { + parameters.append(", "); + parameters.append(field.getSimpleName()); + } } return parameters.toString(); } diff --git 
a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 183b6458d..6de712703 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,5 +1,8 @@ package zingg.common.py.processors; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; import java.util.List; import java.util.Map; @@ -7,7 +10,6 @@ import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; -// import java.util.logging.Logger; import javax.lang.model.element.*; import zingg.common.py.annotations.*; @@ -16,16 +18,13 @@ public class PythonMethodProcessor extends AbstractProcessor { private Map> classMethodsMap; - // private static final Logger LOG = Logger.getLogger(PythonMethodProcessor.class.getName()); @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { ProcessorContext processorContext = ProcessorContext.getInstance(); classMethodsMap = processorContext.getClassMethodsMap(); - // LOG.info("Processing PythonMethod annotations..."); - // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { if (element.getKind() == ElementKind.METHOD) { @@ -36,17 +35,18 @@ public boolean process(Set annotations, RoundEnvironment List methodNames = classMethodsMap.get(className); if (methodNames.contains(methodElement.getSimpleName().toString())) { - // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); - System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):"); - generateMethodReturn(methodElement); - generateFieldAssignment(methodElement); + try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { + fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n"); + generateMethodReturn(methodElement, fileWriter); + generateFieldAssignment(methodElement, fileWriter); + fileWriter.write("\n"); + } catch (IOException e) { + e.printStackTrace(); + } } } } - System.out.println(); } - // LOG.info("Processing complete."); return false; } @@ -65,7 +65,7 @@ private String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - private void generateMethodReturn(ExecutableElement methodElement) { + private void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { TypeMirror returnType = methodElement.getReturnType(); if (returnType.getKind() == TypeKind.VOID) { return; @@ -73,7 +73,7 @@ private void generateMethodReturn(ExecutableElement methodElement) { String returnTypeString = resolveType(returnType); String methodName = methodElement.getSimpleName().toString(); String className = methodElement.getEnclosingElement().getSimpleName().toString(); - System.out.println(" return self." + className.toLowerCase() + "." + methodName + "()"); + fileWriter.write(" return self." + className.toLowerCase() + "." 
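+ // For a getter such as Pipe.getName() this writes, e.g.:
+ //   return self.pipe.getName()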
+ methodName + "()\n"); } } @@ -81,7 +81,7 @@ private String resolveType(TypeMirror typeMirror) { return typeMirror.toString(); } - private void generateFieldAssignment(ExecutableElement methodElement) { + private void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { List parameters = methodElement.getParameters(); if (!parameters.isEmpty()) { @@ -95,7 +95,7 @@ private void generateFieldAssignment(ExecutableElement methodElement) { } parameterList.append(parameter.getSimpleName()); } - System.out.println(" self." + className.toLowerCase() + "." + methodName + "(" + parameterList + ")"); + fileWriter.write(" self." + className.toLowerCase() + "." + methodName + "(" + parameterList + ")\n"); } } diff --git a/examples/febrl/GeneratedFebrlExample.py b/examples/febrl/GeneratedFebrlExample.py new file mode 100644 index 000000000..54c64e77e --- /dev/null +++ b/examples/febrl/GeneratedFebrlExample.py @@ -0,0 +1,46 @@ +from zingg.ArgumentsGenerated import * +from zingg.FieldDefinitionGenerated import * +from zingg.PipeGenerated import * +from zingg.otherThanGenerated import * +from zingg.otherThanGeneratedPipe import * +from zingg.otherThanGeneratedArguments import * +from zingg.otherThanGeneratedFieldDefinition import * + +#build the arguments for zingg +args = ExtendedArgumentsGenerated() +#set field definitions +fname = ExtendedFieldDefinitionGenerated("fname", "string", MatchType.FUZZY) +lname = ExtendedFieldDefinitionGenerated("lname", "string", MatchType.FUZZY) +stNo = ExtendedFieldDefinitionGenerated("stNo", "string", MatchType.FUZZY) +add1 = ExtendedFieldDefinitionGenerated("add1", "string", MatchType.FUZZY) +add2 = ExtendedFieldDefinitionGenerated("add2", "string", MatchType.FUZZY) +city = ExtendedFieldDefinitionGenerated("city", "string", MatchType.FUZZY) +areacode = ExtendedFieldDefinitionGenerated("areacode", "string", MatchType.FUZZY) +state = ExtendedFieldDefinitionGenerated("state", "string", MatchType.FUZZY) +dob = ExtendedFieldDefinitionGenerated("dob", "string", MatchType.FUZZY) +ssn = ExtendedFieldDefinitionGenerated("ssn", "string", MatchType.FUZZY) + +fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn] + +args.setFieldDefinition(fieldDefs) +#set the modelid and the zingg dir +args.setModelId("0102") +args.setZinggDir("models") +args.setNumPartitions(4) +args.setLabelDataSampleSize(0.5) + +#reading dataset into inputPipe and setting it up in 'args' +#below line should not be required if you are reading from in-memory dataset +#in that case, replace df with input df +schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" +inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema) +args.setData(inputPipe) +outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput") + +args.setOutput(outputPipe) + +options = ClientOptions([ClientOptions.PHASE,"findTrainingData"]) + +#Zingg execution for the given phase +zingg = Zingg(args, options) +zingg.initAndExecute() \ No newline at end of file diff --git a/python/zingg/ArgumentsGenerated.py b/python/zingg/ArgumentsGenerated.py new file mode 100644 index 000000000..e03fc961c --- /dev/null +++ b/python/zingg/ArgumentsGenerated.py @@ -0,0 +1,44 @@ +from zingg.otherThanGenerated import * +class Arguments: + def __init__(self): + self.arguments = getJVM().zingg.common.client.Arguments() + + def setNumPartitions(self, numPartitions): + 
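+        # Each generated method below is a thin delegate: the call is forwarded
+        # through py4j to the matching @PythonMethod on zingg.common.client.Arguments.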
self.arguments.setNumPartitions(numPartitions) + + def setLabelDataSampleSize(self, labelDataSampleSize): + self.arguments.setLabelDataSampleSize(labelDataSampleSize) + + def getModelId(self): + return self.arguments.getModelId() + + def setModelId(self, modelId): + self.arguments.setModelId(modelId) + + def setOutput(self, outputDir): + self.arguments.setOutput(outputDir) + + def setZinggDir(self, zinggDir): + self.arguments.setZinggDir(zinggDir) + + def getZinggBaseModelDir(self): + return self.arguments.getZinggBaseModelDir() + + def getZinggModelDir(self): + return self.arguments.getZinggModelDir() + + def getZinggBaseTrainingDataDir(self): + return self.arguments.getZinggBaseTrainingDataDir() + + def getZinggTrainingDataUnmarkedDir(self): + return self.arguments.getZinggTrainingDataUnmarkedDir() + + def getZinggTrainingDataMarkedDir(self): + return self.arguments.getZinggTrainingDataMarkedDir() + + def setStopWordsCutoff(self, stopWordsCutoff): + self.arguments.setStopWordsCutoff(stopWordsCutoff) + + def setColumn(self, column): + self.arguments.setColumn(column) + diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py new file mode 100644 index 000000000..4713552bd --- /dev/null +++ b/python/zingg/FieldDefinitionGenerated.py @@ -0,0 +1,24 @@ +from zingg.otherThanGenerated import * +class FieldDefinition: + def __init__(self, name, dataType, *matchType): + self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() + self.fielddefinition.setFieldName(name) + self.fielddefinition.setDataType(self.stringify(dataType)) + self.fielddefinition.setMatchType(matchType) + self.fielddefinition.setFields(name) + + def getFieldDefinition(self): + return self.fielddefinition + + def setFields(self, fields): + self.fielddefinition.setFields(fields) + + def setMatchType(self, type): + self.fielddefinition.setMatchType(type) + + def setStopWords(self, stopWords): + self.fielddefinition.setStopWords(stopWords) + + def setFieldName(self, fieldName): + self.fielddefinition.setFieldName(fieldName) + diff --git a/python/zingg/PipeGenerated.py b/python/zingg/PipeGenerated.py new file mode 100644 index 000000000..6144a7177 --- /dev/null +++ b/python/zingg/PipeGenerated.py @@ -0,0 +1,35 @@ +from zingg.otherThanGenerated import * +import logging +LOG = logging.getLogger("zingg.pipes") + +JPipe = getJVM().zingg.spark.client.pipe.SparkPipe +FilePipe = getJVM().zingg.common.client.pipe.FilePipe +JStructType = getJVM().org.apache.spark.sql.types.StructType + +class Pipe: + def __init__(self, name, format): + self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe() + self.pipe.setName(name) + self.pipe.setFormat(format) + + def setSchema(self, schema): + self.pipe.setSchema(schema) + + def getName(self): + return self.pipe.getName() + + def setName(self, name): + self.pipe.setName(name) + + def getFormat(self): + return self.pipe.getFormat() + + def setFormat(self, sinkType): + self.pipe.setFormat(sinkType) + + def setProp(self, k, v): + self.pipe.setProp(k, v) + + def toString(self): + return self.pipe.toString() + diff --git a/python/zingg/otherThanGenerated.py b/python/zingg/otherThanGenerated.py new file mode 100644 index 000000000..af1f1c710 --- /dev/null +++ b/python/zingg/otherThanGenerated.py @@ -0,0 +1,510 @@ +""" +zingg.client +------------------------ +This module is the main entry point of the Zingg Python API +""" + +import logging +import argparse +import pandas as pd +from pyspark.sql import DataFrame + +from pyspark import SparkConf, 
SparkContext, SQLContext + +from py4j.java_collections import SetConverter, MapConverter, ListConverter + +from pyspark.sql import SparkSession +import os + +LOG = logging.getLogger("zingg") + +_spark_ctxt = None +_sqlContext = None +_spark = None +_zingg_jar = 'zingg-0.4.1-SNAPSHOT.jar' + +def initSparkClient(): + global _spark_ctxt + global _sqlContext + global _spark + _spark_ctxt = SparkContext.getOrCreate() + _sqlContext = SQLContext(_spark_ctxt) + _spark = SparkSession.builder.getOrCreate() + return 1 + +def initDataBricksConectClient(): + global _spark_ctxt + global _sqlContext + global _spark + jar_path = os.getenv('ZINGG_HOME')+'/'+_zingg_jar + _spark = SparkSession.builder.config('spark.jars', jar_path).getOrCreate() + _spark_ctxt = _spark.sparkContext + _sqlContext = SQLContext(_spark_ctxt) + return 1 + +def initClient(): + global _spark_ctxt + global _sqlContext + global _spark + if _spark_ctxt is None: + DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') + if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': + return initDataBricksConectClient() + else: + return initSparkClient() + else: + return 1 + +def getSparkContext(): + if _spark_ctxt is None: + initClient() + return _spark_ctxt + +def getSparkSession(): + if _spark is None: + initClient() + return _spark + +def getSqlContext(): + if _sqlContext is None: + initClient() + return _sqlContext + +def getJVM(): + return getSparkContext()._jvm + +def getGateway(): + return getSparkContext()._gateway + +ColName = getJVM().zingg.common.client.util.ColName +MatchType = getJVM().zingg.common.client.MatchType +ClientOptions = getJVM().zingg.common.client.ClientOptions +ZinggOptions = getJVM().zingg.common.client.ZinggOptions +LabelMatchType = getJVM().zingg.common.core.util.LabelMatchType +UpdateLabelMode = 'Overwrite' + +def getDfFromDs(data): + """ Method to convert spark dataset to dataframe + + :param data: provide spark dataset + :type data: DataSet + :return: converted spark dataframe + :rtype: DataFrame + """ + return DataFrame(data.df(), getSqlContext()) + +def getPandasDfFromDs(data): + """ Method to convert spark dataset to pandas dataframe + + :param data: provide spark dataset + :type data: DataSet + :return: converted pandas dataframe + :rtype: DataFrame + """ + df = getDfFromDs(data) + return pd.DataFrame(df.collect(), columns=df.columns) + +class Zingg: + """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. 
+ + :param args: arguments for training and matching + :type args: Arguments + :param options: client option for this class object + :type options: ClientOptions + + """ + + def __init__(self, args, options): + self.inpArgs = args + self.inpOptions = options + self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions()) + + def init(self): + """ Method to initialize zingg client by reading internal configurations and functions """ + self.client.init() + + def execute(self): + """ Method to execute this class object """ + self.client.execute() + + def initAndExecute(self): + """ Method to run both init and execute methods consecutively """ + self.client.init() + DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') + if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': + options = self.client.getOptions() + inpPhase = options.get(ClientOptions.PHASE).getValue() + if (inpPhase==ZinggOptions.LABEL.getValue()): + self.executeLabel() + elif (inpPhase==ZinggOptions.UPDATE_LABEL.getValue()): + self.executeLabelUpdate() + else: + self.client.execute() + else: + self.client.execute() + + def executeLabel(self): + """ Method to run label phase """ + self.client.getTrainingDataModel().setMarkedRecordsStat(self.getMarkedRecords()) + unmarkedRecords = self.getUnmarkedRecords() + updatedRecords = self.processRecordsCli(unmarkedRecords,self.inpArgs) + self.writeLabelledOutput(updatedRecords,self.inpArgs) + + def executeLabelUpdate(self): + """ Method to run label update phase """ + self.processRecordsCliLabelUpdate(self.getMarkedRecords(),self.inpArgs) + + def getMarkedRecords(self): + """ Method to get marked record dataset from the inputpipe + + :return: spark dataset containing marked records + :rtype: Dataset + """ + return self.client.getMarkedRecords() + + def getUnmarkedRecords(self): + """ Method to get unmarked record dataset from the inputpipe + + :return: spark dataset containing unmarked records + :rtype: Dataset + """ + return self.client.getUnmarkedRecords() + + def processRecordsCli(self,unmarkedRecords,args): + """ Method to get user input on unmarked records + + :return: spark dataset containing updated records + :rtype: Dataset + """ + trainingDataModel = self.client.getTrainingDataModel() + labelDataViewHelper = self.client.getLabelDataViewHelper() + + if unmarkedRecords is not None and unmarkedRecords.count() > 0: + labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + unmarkedRecords = unmarkedRecords.cache() + displayCols = labelDataViewHelper.getDisplayColumns(unmarkedRecords, args.getArgs()) + clusterIdZFrame = labelDataViewHelper.getClusterIdsFrame(unmarkedRecords) + clusterIDs = labelDataViewHelper.getClusterIds(clusterIdZFrame) + totalPairs = clusterIDs.size() + updatedRecords = None + for index in range(totalPairs): + currentPair = labelDataViewHelper.getCurrentPair(unmarkedRecords, index, clusterIDs, clusterIdZFrame) + + score = labelDataViewHelper.getScore(currentPair) + prediction = labelDataViewHelper.getPrediction(currentPair) + + msg1 = labelDataViewHelper.getMsg1(index, totalPairs) + msg2 = labelDataViewHelper.getMsg2(prediction, score) + labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), msg1, msg2) + selected_option = input() + while int(selected_option) not in [0,1,2,9]: + print('Please enter valid option') + selected_option = 
input("Enter choice: ") + if int(selected_option) == 9: + print("User has quit in the middle. Updating the records.") + break + trainingDataModel.updateLabellerStat(int(selected_option), 1) + labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + updatedRecords = trainingDataModel.updateRecords(int(selected_option), currentPair, updatedRecords) + print("Processing finished.") + return updatedRecords + else: + print("It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler.") + return None + + def processRecordsCliLabelUpdate(self,lines,args): + trainingDataModel = self.client.getTrainingDataModel() + labelDataViewHelper = self.client.getLabelDataViewHelper() + if (lines is not None and lines.count() > 0): + trainingDataModel.setMarkedRecordsStat(lines) + labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + displayCols = labelDataViewHelper.getDSUtil().getFieldDefColumns(lines, args.getArgs(), False, args.getArgs().getShowConcise()) + updatedRecords = None + recordsToUpdate = lines + selectedOption = -1 + + while (str(selectedOption) != '9'): + cluster_id = input("\n\tPlease enter the cluster id (or 9 to exit): ") + if str(cluster_id) == '9': + print("User has exited in the middle. Updating the records.") + break + currentPair = lines.filter(lines.equalTo(ColName.CLUSTER_COLUMN, cluster_id)) + if currentPair.isEmpty(): + print("\tInvalid cluster id. Enter '9' to exit") + continue + + matchFlag = currentPair.getAsInt(currentPair.head(),ColName.MATCH_FLAG_COL) + preMsg = "\n\tThe record pairs belonging to the input cluster id "+cluster_id+" are:" + postMsg = "\tThe above pair is labeled as "+str(matchFlag)+"\n" + labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), preMsg, postMsg) + selectedOption = input() + trainingDataModel.updateLabellerStat(int(selectedOption), 1) + trainingDataModel.updateLabellerStat(matchFlag, -1) + labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + + if (str(selectedOption) == '9'): + print("User has quit in the middle. Updating the records.") + break + + recordsToUpdate = recordsToUpdate.filter(recordsToUpdate.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) + + if (updatedRecords is not None): + updatedRecords = updatedRecords.filter(updatedRecords.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) + + updatedRecords = trainingDataModel.updateRecords(int(selectedOption), currentPair, updatedRecords) + + if updatedRecords is not None: + updatedRecords = updatedRecords.union(recordsToUpdate) + + outPipe = trainingDataModel.getOutputPipe(args.getArgs()) + outPipe.setMode(UpdateLabelMode) + + trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs(),outPipe) + print("Processing finished.") + return updatedRecords + else: + print("There are no marked records for updating. 
Please run findTrainingData/label jobs to generate training data.") + return None + + + def writeLabelledOutput(self,updatedRecords,args): + """ Method to write updated records after user input + """ + trainingDataModel = self.client.getTrainingDataModel() + if updatedRecords is not None: + trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs()) + + def writeLabelledOutputFromPandas(self,candidate_pairs_pd,args): + """ Method to write updated records (as pandas df) after user input + """ + markedRecordsAsDS = (getSparkSession().createDataFrame(candidate_pairs_pd))._jdf + # pandas df gives z_isMatch as long so it needs to be cast + markedRecordsAsDS = markedRecordsAsDS.withColumn(ColName.MATCH_FLAG_COL,markedRecordsAsDS.col(ColName.MATCH_FLAG_COL).cast("int")) + updatedRecords = getJVM().zingg.spark.client.SparkFrame(markedRecordsAsDS) + self.writeLabelledOutput(updatedRecords,args) + + def setArguments(self, args): + """ Method to set Arguments + + :param args: provide arguments for this class object + :type args: Arguments + """ + self.client.setArguments(args.getArgs()) + + def getArguments(self): + """ Method to get arguments of this class object + + :return: The pointer containing address of the Arguments object of this class object + :rtype: pointer(Arguments) + """ + return self.client.getArguments() + + def getOptions(self): + """ Method to get client options of this class object + + :return: The pointer containing the address of the ClientOptions object of this class object + :rtype: pointer(ClientOptions) + """ + return self.client.getOptions() + + def setOptions(self, options): + """ Method to set client options of this class object + + :param options: provide client options for this class object + :type options: ClientOptions + :return: The pointer containing address of the ClientOptions object of this class object + :rtype: pointer(ClientOptions) + """ + return self.client.setOptions(options) + + def getMarkedRecordsStat(self, markedRecords, value): + """ Method to get No. of records that are marked + + :param markedRecords: spark dataset containing marked records + :type markedRecords: Dataset + :param value: flag value to check if markedRecord is initially matched or not + :type value: long + :return: The no. of marked records + :rtype: int + """ + return self.client.getMarkedRecordsStat(markedRecords, value) + + def getMatchedMarkedRecordsStat(self): + """ Method to get No. of records that are marked and matched + + :return: The no. of matched marked records + :rtype: int + """ + return self.client.getMatchedMarkedRecordsStat(self.getMarkedRecords()) + + def getUnmatchedMarkedRecordsStat(self): + """ Method to get No. of records that are marked and unmatched + + :return: The no. of unmatched marked records + :rtype: int + """ + return self.client.getUnmatchedMarkedRecordsStat(self.getMarkedRecords()) + + def getUnsureMarkedRecordsStat(self): + """ Method to get No. of records that are marked as Not Sure, i.e. neither matched nor unmatched + + :return: The no. of Not Sure marked records + :rtype: int + """ + return self.client.getUnsureMarkedRecordsStat(self.getMarkedRecords()) + + + +class ZinggWithSpark(Zingg): + + """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. 
+ + :param args: arguments for training and matching + :type args: Arguments + :param options: client option for this class object + :type options: ClientOptions + + """ + + def __init__(self, args, options): + self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions(), getSparkSession()._jsparkSession) + +class ClientOptions: + """ Class that contains Client options for Zingg object + :param phase: trainMatch, train, match, link, findAndLabel, findTrainingData, recommend etc + :type phase: String + :param args: Parse a list of Zingg command line options parameter values e.g. "--location" etc. optional argument for initializing this class. + :type args: List(String) or None + """ + PHASE = getJVM().zingg.common.client.ClientOptions.PHASE + """:PHASE: phase parameter for this class""" + CONF = getJVM().zingg.common.client.ClientOptions.CONF + """:CONF: conf parameter for this class""" + LICENSE = getJVM().zingg.common.client.ClientOptions.LICENSE + """:LICENSE: license parameter for this class""" + EMAIL = getJVM().zingg.common.client.ClientOptions.EMAIL + """:EMAIL: e-mail parameter for this class""" + LOCATION = getJVM().zingg.common.client.ClientOptions.LOCATION + """:LOCATION: location parameter for this class""" + REMOTE = getJVM().zingg.common.client.ClientOptions.REMOTE + """:REMOTE: remote option used internally for running on Databricks""" + ZINGG_DIR = getJVM().zingg.common.client.ClientOptions.ZINGG_DIR + """:ZINGG_DIR: location where Zingg saves the model, training data etc""" + MODEL_ID = getJVM().zingg.common.client.ClientOptions.MODEL_ID + """:MODEL_ID: ZINGG_DIR/MODEL_ID is used to save the model""" + COLUMN = getJVM().zingg.common.client.ClientOptions.COLUMN + """:COLUMN: Column whose stop words are to be recommended through Zingg""" + + def __init__(self, argsSent=None): + print(argsSent) + if(argsSent == None): + args = [] + else: + args = argsSent.copy() + if (not (self.PHASE in args)): + args.append(self.PHASE) + args.append("peekModel") + if (not (self.LICENSE in args)): + args.append(self.LICENSE) + args.append("zinggLic.txt") + if (not (self.EMAIL in args)): + args.append(self.EMAIL) + args.append("zingg@zingg.ai") + if (not (self.CONF in args)): + args.append(self.CONF) + args.append("dummyConf.json") + print("arguments for client options are ", args) + self.co = getJVM().zingg.common.client.ClientOptions(args) + + + def getClientOptions(self): + """ Method to get pointer address of this class + + :return: The pointer containing address of the this class object + :rtype: pointer(ClientOptions) + """ + return self.co + + def getOptionValue(self, option): + """ Method to get value for the key option + + :param option: key to geting the value + :type option: String + :return: The value which is mapped for given key + :rtype: String + """ + return self.co.getOptionValue(option) + + def setOptionValue(self, option, value): + """ Method to map option key to the given value + + :param option: key that is mapped with value + :type option: String + :param value: value to be set for given key + :type value: String + """ + self.co.get(option).setValue(value) + + def getPhase(self): + """ Method to get PHASE value + + :return: The PHASE parameter value + :rtype: String + """ + return self.co.get(ClientOptions.PHASE).getValue() + + def setPhase(self, newValue): + """ Method to set PHASE value + + :param newValue: name of the phase + :type newValue: String + :return: The pointer containing address of the this class object after seting phase + 
:rtype: pointer(ClientOptions) + """ + self.co.get(ClientOptions.PHASE).setValue(newValue) + + def getConf(self): + """ Method to get CONF value + + :return: The CONF parameter value + :rtype: String + """ + return self.co.get(ClientOptions.CONF).getValue() + + def hasLocation(self): + """ Method to check if this class has LOCATION parameter set as None or not + + :return: The boolean value if LOCATION parameter is present or not + :rtype: Bool + """ + if(self.co.get(ClientOptions.LOCATION)==None): + return False + else: + return True + + def getLocation(self): + """ Method to get LOCATION value + + :return: The LOCATION parameter value + :rtype: String + """ + return self.co.get(ClientOptions.LOCATION).getValue() + +def parseArguments(argv): + """ This method is used for checking mandatory arguments and creating an arguments list from Command line arguments + + :param argv: Values that are passed during the calling of the program along with the calling statement. + :type argv: List + :return: a list containing necessary arguments to run any phase + :rtype: List + """ + parser = argparse.ArgumentParser(description='Zingg\'s python APIs') + mandatoryOptions = parser.add_argument_group('mandatory arguments') + mandatoryOptions.add_argument('--phase', required=True, + help='python phase e.g. assessModel') + mandatoryOptions.add_argument('--conf', required=True, + help='JSON configuration with data input output locations and field definitions') + + args, remaining_args = parser.parse_known_args(argv) + LOG.debug("args: ", args) + return args \ No newline at end of file diff --git a/python/zingg/otherThanGeneratedArguments.py b/python/zingg/otherThanGeneratedArguments.py new file mode 100644 index 000000000..113d08ead --- /dev/null +++ b/python/zingg/otherThanGeneratedArguments.py @@ -0,0 +1,56 @@ +from zingg.ArgumentsGenerated import * +from zingg.otherThanGeneratedFieldDefinition import * + +class ExtendedArgumentsGenerated(Arguments): + def __init__(self): + super().__init__() + + def setFieldDefinition(self, fieldDef): + javaFieldDef = [] + for f in fieldDef: + javaFieldDef.append(f.getFieldDefinition()) + self.arguments.setFieldDefinition(javaFieldDef) + + def setData(self, *pipes): + dataPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) + for idx, pipe in enumerate(pipes): + dataPipe[idx] = pipe.getPipe() + self.arguments.setData(dataPipe) + + def setOutput(self, *pipes): + outputPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) + for idx, pipe in enumerate(pipes): + outputPipe[idx] = pipe.getPipe() + self.arguments.setOutput(outputPipe) + + def getArgs(self): + return self.arguments + + def setTrainingSamples(self, *pipes): + dataPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) + for idx, pipe in enumerate(pipes): + dataPipe[idx] = pipe.getPipe() + self.arguments.setTrainingSamples(dataPipe) + + def writeArgumentsToJSON(self, fileName): + getJVM().zingg.common.client.ArgumentsUtil().writeArgumentsToJSON(fileName, self.arguments) + + @staticmethod + def createArgumentsFromJSON(fileName, phase): + obj = Arguments() + obj.arguments = getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSON(fileName, phase) + return obj + + def writeArgumentsToJSONString(self): + return getJVM().zingg.common.client.ArgumentsUtil().writeArgumentstoJSONString(self.arguments) + + @staticmethod + def createArgumentsFromJSONString(jsonArgs, phase): + obj = Arguments() + obj.arguments = 
getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSONString(jsonArgs, phase)
+        return obj
+
+    def copyArgs(self, phase):
+        argsString = self.writeArgumentsToJSONString()
+        return self.createArgumentsFromJSONString(argsString, phase)
+    
\ No newline at end of file
diff --git a/python/zingg/otherThanGeneratedFieldDefinition.py b/python/zingg/otherThanGeneratedFieldDefinition.py
new file mode 100644
index 000000000..43f3d229e
--- /dev/null
+++ b/python/zingg/otherThanGeneratedFieldDefinition.py
@@ -0,0 +1,20 @@
+from zingg.FieldDefinitionGenerated import *
+
+class ExtendedFieldDefinitionGenerated(FieldDefinition):
+    def __init__(self, name, dataType, *matchType):
+        super().__init__(name, dataType, *matchType)
+
+    def getFieldDefinition(self):
+        return self.fielddefinition
+
+    # the dataType should be stringified before it is set in the fd object
+    def stringify(self, str):
+        """ Method to stringify the dataType before it is set in the FieldDefinition object
+
+        :param str: dataType of the FieldDefinition
+        :type str: String
+        :return: The stringified value of the dataType
+        :rtype: String
+        """
+
+        return str
\ No newline at end of file
diff --git a/python/zingg/otherThanGeneratedPipe.py b/python/zingg/otherThanGeneratedPipe.py
new file mode 100644
index 000000000..a46df2794
--- /dev/null
+++ b/python/zingg/otherThanGeneratedPipe.py
@@ -0,0 +1,228 @@
+from zingg.PipeGenerated import *
+
+class ExtendedPipeGenerated(Pipe):
+    def __init__(self, name, format):
+        super().__init__(name, format)
+
+    def getPipe(self):
+        return self.pipe
+
+    def addProperty(self, name, value):
+        """ Method for adding different properties of the pipe
+
+        :param name: name of the property
+        :type name: String
+        :param value: value you want to set for the property
+        :type value: String
+        """
+        self.pipe.setProp(name, value)
+
+class CsvPipe(ExtendedPipeGenerated):
+    """ Class CsvPipe: used for working with delimited text files, in which a delimiter character (such as a comma or a pipe symbol) separates units of text that belong in different columns.
+
+    :param name: name of the pipe.
+ :type name: String + :param location: (optional) location from where we read data + :type location: String or None + :param schema: (optional) json schema for the pipe + :type schema: Schema or None + """ + def __init__(self, name, location = None, schema = None): + ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_CSV) + if(location != None): + ExtendedPipeGenerated.addProperty(self, FilePipe.LOCATION, location) + if(schema != None): + #df = spark.read.format(JPipe.FORMAT_CSV).schema(schema).load(location) + #s = JStructType.fromDDL(schema) + ExtendedPipeGenerated.setSchema(self, schema) + print("set schema ") + + def setDelimiter(self, delimiter): + """ This method is used to define delimiter of CsvPipe + + :param delimiter: a sequence of one or more characters for specifying the boundary between separate, independent regions in data streams + :type delimiter: String + """ + ExtendedPipeGenerated.addProperty(self, "delimiter", delimiter) + + + def setLocation(self, location): + """ Method to set location of pipe + + :param location: location from where we read data + :type location: String + """ + ExtendedPipeGenerated.addProperty(self, FilePipe.LOCATION, location) + + def setHeader(self, header): + """ Method to set header property of pipe + + :param header: true if pipe have header, false otherwise + :type header: Boolean + """ + ExtendedPipeGenerated.addProperty(self, FilePipe.HEADER, header) + +class BigQueryPipe(ExtendedPipeGenerated): + """ Pipe Class for working with BigQuery pipeline + + :param name: name of the pipe. + :type name: String + """ + + VIEWS_ENABLED = "viewsEnabled" + CREDENTIAL_FILE = "credentialsFile" + TABLE = "table" + TEMP_GCS_BUCKET="temporaryGcsBucket" + + def __init__(self,name): + ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_BIGQUERY) + + def setCredentialFile(self, file): + """ Method to set Credential file to the pipe + + :param file: credential file name + :type file: String + """ + ExtendedPipeGenerated.addProperty(self, "credentialsFile", file) + + def setTable(self, table): + """ Method to set Table to the pipe + + :param table: provide table parameter + :type table: String + """ + ExtendedPipeGenerated.addProperty(self, "table", table) + + def setTemporaryGcsBucket(self, bucket): + """ Method to set TemporaryGcsBucket to the pipe + + :param bucket: provide bucket parameter + :type bucket: String + """ + ExtendedPipeGenerated.addProperty(self, "temporaryGcsBucket", bucket) + + def setViewsEnabled(self, isEnabled): + """ Method to set if viewsEnabled parameter is Enabled or not + + :param isEnabled: provide boolean parameter which defines if viewsEnabled option is enable or not + :type isEnabled: Bool + """ + ExtendedPipeGenerated.addProperty(self, "viewsEnabled", isEnabled) + + +class SnowflakePipe(ExtendedPipeGenerated): + """ Pipe Class for working with Snowflake pipeline + + :param name: name of the pipe + :type name: String + """ + URL = "sfUrl" + USER = "sfUser" + PASSWORD = "sfPassword" + DATABASE ="sfDatabase" + SCHEMA = "sfSchema" + WAREHOUSE = "sfWarehouse" + DBTABLE = "dbtable" + + def __init__(self,name): + ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_SNOWFLAKE) + ExtendedPipeGenerated.addProperty(self, "application", "zinggai_zingg") + + + def setURL(self, url): + """ Method to set url to the pipe + + :param url: provide url for this pipe + :type url: String + """ + ExtendedPipeGenerated.addProperty(self, "sfUrl", url) + + def setUser(self, user): + """ Method to set User to the pipe + + :param user: provide 
User parameter.
+        :type user: String
+        """
+        ExtendedPipeGenerated.addProperty(self, "sfUser", user)
+
+    def setPassword(self, passwd):
+        """ Method to set Password to the pipe
+
+        :param passwd: provide Password parameter.
+        :type passwd: String
+        """
+        ExtendedPipeGenerated.addProperty(self, "sfPassword", passwd)
+
+    def setDatabase(self, db):
+        """ Method to set Database to the pipe
+
+        :param db: provide Database parameter.
+        :type db: Database
+        """
+        ExtendedPipeGenerated.addProperty(self, "sfDatabase", db)
+
+    def setSFSchema(self, schema):
+        """ Method to set Schema to the pipe
+
+        :param schema: provide schema parameter.
+        :type schema: Schema
+        """
+        ExtendedPipeGenerated.addProperty(self, "sfSchema", schema)
+
+    def setWarehouse(self, warehouse):
+        """ Method to set warehouse parameter to the pipe
+
+        :param warehouse: provide warehouse parameter.
+        :type warehouse: String
+        """
+        ExtendedPipeGenerated.addProperty(self, "sfWarehouse", warehouse)
+
+    def setDbTable(self, dbtable):
+        """ Method to set dbtable to the pipe
+
+        :param dbtable: provide dbtable parameter.
+        :type dbtable: String
+        """
+        ExtendedPipeGenerated.addProperty(self, "dbtable", dbtable)
+
+
+class InMemoryPipe(ExtendedPipeGenerated):
+    """ Pipe Class for working with InMemory pipeline
+
+    :param name: name of the pipe
+    :type name: String
+    :param df: provide dataset for this pipe (optional)
+    :type df: Dataset or None
+    """
+
+    def __init__(self, name, df = None):
+        ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_INMEMORY)
+        if (df is not None):
+            self.setDataset(df)
+
+    def setDataset(self, df):
+        """ Method to set DataFrame of the pipe
+
+        :param df: pandas or spark dataframe for the pipe
+        :type df: DataFrame
+        """
+        if (isinstance(df, pd.DataFrame)):
+            print('schema of pandas df is ', ExtendedPipeGenerated.getPipe(self).getSchema())
+            if (ExtendedPipeGenerated.getPipe(self).getSchema() is not None):
+                ds = getSparkSession().createDataFrame(df, schema=ExtendedPipeGenerated.getPipe(self).getSchema())
+            else:
+                ds = getSparkSession().createDataFrame(df)
+
+            ExtendedPipeGenerated.getPipe(self).setDataset(ds._jdf)
+        elif (isinstance(df, DataFrame)):
+            ExtendedPipeGenerated.getPipe(self).setDataset(df._jdf)
+        else:
+            LOG.error(" setDataset(): Null or unsupported type: %s", type(df))
+
+    def getDataset(self):
+        """ Method to get Dataset from pipe
+
+        :return: dataset of the pipe in the format of spark dataset
+        :rtype: Dataset
+        """
+        return ExtendedPipeGenerated.getPipe(self).getDataset().df()
\ No newline at end of file

From b6d42be809a85c5f675f3af7570e46ba7696246e Mon Sep 17 00:00:00 2001
From: Vikas Gupta
Date: Fri, 2 Feb 2024 10:41:45 +0530
Subject: [PATCH 048/219] 1st cut integration tests for OSS

---
 .../common/core/executor/ExecutorTester.java  | 24 ++++
 .../common/core/executor/JunitLabeller.java   | 61 ++++++++++++
 .../common/core/executor/LabellerTester.java  | 36 ++++++++
 .../common/core/executor/MatcherTester.java   | 82 ++++++++++++++++++
 .../core/executor/TestExecutorsGeneric.java   | 83 +++++++++++++++++++
 .../common/core/executor/TrainerTester.java   | 19 +++++
 .../executor/TrainingDataFinderTester.java    | 29 +++++++
 .../infra/util/PojoToArrayConverter.java      | 2 +-
 8 files changed, 335 insertions(+), 1 deletion(-)
 create mode 100644 common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java
 create mode 100644 common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java
 create mode 100644 common/core/src/test/java/zingg/common/core/executor/LabellerTester.java
 create mode 100644
common/core/src/test/java/zingg/common/core/executor/MatcherTester.java create mode 100644 common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java create mode 100644 common/core/src/test/java/zingg/common/core/executor/TrainerTester.java create mode 100644 common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java b/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java new file mode 100644 index 000000000..8addea3f8 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java @@ -0,0 +1,24 @@ +package zingg.common.core.executor; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZinggClientException; + +public abstract class ExecutorTester { + + public static final Log LOG = LogFactory.getLog(ExecutorTester.class); + + public ZinggBase executor; + + public ExecutorTester(ZinggBase executor) { + this.executor = executor; + } + + public void execute() throws ZinggClientException { + executor.execute(); + } + + public abstract void validateResults() throws ZinggClientException; + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java b/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java new file mode 100644 index 000000000..8b4a66874 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java @@ -0,0 +1,61 @@ +package zingg.common.core.executor; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.options.ZinggOptions; +import zingg.common.client.util.ColName; +import zingg.common.client.util.ColValues; +import zingg.common.core.context.Context; +import zingg.common.core.executor.Labeller; + +public class JunitLabeller extends Labeller { + + private static final long serialVersionUID = 1L; + + public JunitLabeller(Context context) { + setZinggOption(ZinggOptions.LABEL); + setContext(context); + } + + @Override + public ZFrame processRecordsCli(ZFrame lines) + throws ZinggClientException { + + // now get a list of all those rows which have same cluster and match due to fname => mark match + ZFrame lines2 = getDSUtil().getPrefixedColumnsDS(lines); + + // construct AND condition + C clusterCond = getJoinCondForCol(lines, lines2, ColName.CLUSTER_COLUMN,true); + C fnameCond = getJoinCondForCol(lines, lines2, "FNAME",true); + C idCond = getJoinCondForCol(lines, lines2, "ID",false); + C filterCond = lines2.and(lines2.and(clusterCond,idCond),fnameCond); + + ZFrame filtered = lines.joinOnCol(lines2, filterCond).cache(); + + ZFrame matches = filtered.select(ColName.CLUSTER_COLUMN).distinct().withColumn(ColName.MATCH_FLAG_COL, ColValues.IS_MATCH_PREDICTION).cache(); + + ZFrame nonMatches = lines.select(ColName.CLUSTER_COLUMN).except(matches.select(ColName.CLUSTER_COLUMN)).distinct().withColumn(ColName.MATCH_FLAG_COL, ColValues.IS_NOT_A_MATCH_PREDICTION).cache(); + + ZFrame all = matches.unionAll(nonMatches); + + ZFrame linesMatched = lines; + linesMatched = linesMatched.drop(ColName.MATCH_FLAG_COL); + linesMatched = linesMatched.joinOnCol(all, ColName.CLUSTER_COLUMN); + linesMatched = linesMatched.select(lines.columns()); // make same order + + return linesMatched; + } + + private C getJoinCondForCol(ZFrame df1, ZFrame dfToJoin,String colName, boolean equal) { + C column = df1.col(colName); + C columnWithPrefix 
= dfToJoin.col(ColName.COL_PREFIX + colName); + C equalTo = df1.equalTo(column,columnWithPrefix); + if (equal) { + return equalTo; + } else { + return df1.not(equalTo); + } + } + + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java b/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java new file mode 100644 index 000000000..d522a26b6 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java @@ -0,0 +1,36 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; + +public class LabellerTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(LabellerTester.class); + + public LabellerTester(Labeller executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + // check that marked data has at least 1 match row and 1 unmatch row + ZFrame dfMarked = executor.getContext().getPipeUtil(). + read(false, false, executor.getContext().getPipeUtil().getTrainingDataMarkedPipe(executor.getArgs())); + + C matchCond = dfMarked.equalTo(ColName.MATCH_FLAG_COL, 1); + C notMatchCond = dfMarked.equalTo(ColName.MATCH_FLAG_COL, 0); + + long matchCount = dfMarked.filter(matchCond).count(); + assertTrue(matchCount > 1); + long unmatchCount = dfMarked.filter(notMatchCond).count(); + assertTrue(unmatchCount > 1); + LOG.info("matchCount : "+ matchCount + ", unmatchCount : " + unmatchCount); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java new file mode 100644 index 000000000..7ee3eda04 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -0,0 +1,82 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; + +public abstract class MatcherTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(MatcherTester.class); + + public MatcherTester(Matcher executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + assessAccuracy(); + } + + public String getClusterColName() { + return ColName.CLUSTER_COLUMN; + } + + protected void assessAccuracy() throws ZinggClientException { + ZFrame df = getOutputData(); + + df = df.withColumn("fnameId",df.concat(df.col("fname"), df.col("id"))); + df = df.select("fnameId", getClusterColName()); + df = df.withColumn("dupeFnameId",substr(df,df.col("fnameId"),0,8)).cache(); + ZFrame df1 = df.withColumnRenamed("fnameId", "fnameId1").withColumnRenamed("dupeFnameId", "dupeFnameId1") + .withColumnRenamed(getClusterColName(), getClusterColName() + "1").cache(); + + + ZFrame gold = joinAndFilter("dupeFnameId", df, df1).cache(); + ZFrame result = joinAndFilter(getClusterColName(), df, df1).cache(); + + ZFrame fn = gold.except(result); + ZFrame tp = intersect(gold,result); + ZFrame fp = result.except(gold); + + long fnCount = fn.count(); + long tpCount = tp.count(); + long fpCount = 
fp.count(); + + LOG.info("False negative " + fnCount); + LOG.info("True positive " + tpCount); + LOG.info("False positive " + fpCount); + LOG.info("precision " + (tpCount*1.0d/(tpCount+fpCount))); + LOG.info("recall " + tpCount + " denom " + (tpCount+fnCount) + " overall " + (tpCount*1.0d/(tpCount+fnCount))); + + assertTrue(0.8 < (tpCount*1.0d/(tpCount+fpCount))); + assertTrue(0.8 < (tpCount*1.0d/(tpCount+fnCount))); + } + + public ZFrame getOutputData() throws ZinggClientException { + ZFrame output = executor.getContext().getPipeUtil().read(false, false, executor.getArgs().getOutput()[0]); + return output; + } + + protected ZFrame joinAndFilter(String colName, ZFrame df, ZFrame df1){ + C col1 = df.col(colName); + C col2 = df1.col(colName+"1"); + ZFrame joined = df.joinOnCol(df1, df.equalTo(col1, col2)); + return joined.filter(gt(joined, joined.col("fnameId"), joined.col("fnameId1"))); + } + + + // returns df1.intersect(df2) + public abstract ZFrame intersect(ZFrame df1, ZFrame df2); + + // return df1.substr(col,startPos,len) + public abstract C substr(ZFrame df1, C col, int startPos, int len); + + // return df1.gt(c1, c2) + public abstract C gt(ZFrame df1, C column1, C column2); + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java new file mode 100644 index 000000000..7625e596e --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -0,0 +1,83 @@ +package zingg.common.core.executor; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ArgumentsUtil; +import zingg.common.client.IArguments; +import zingg.common.client.ZinggClientException; + +public abstract class TestExecutorsGeneric { + + public static final Log LOG = LogFactory.getLog(TestExecutorsGeneric.class); + + protected IArguments args; + + + protected S session; + + protected final String PARENT_CONFIG_FILE = "PARENT-CONFIG-FILE"; + + public TestExecutorsGeneric() { + + } + + public TestExecutorsGeneric(S s) throws ZinggClientException, IOException { + init(s); + } + + public void init(S s) throws ZinggClientException, IOException { + this.session = s; + // set up args + String configFile = setupArgs(); + } + + public String setupArgs() throws ZinggClientException, IOException { + String configFile = getClass().getClassLoader().getResource(getConfigFile()).getFile(); + args = new ArgumentsUtil().createArgumentsFromJSON( + configFile, + "findTrainingData"); + return configFile; + } + + public abstract String getConfigFile(); + + public abstract String getConfigIncrFile(); + + public abstract String getConfigApproveFile(); + + + public void testExecutors(List> executorTesterList) throws ZinggClientException { + for (ExecutorTester executorTester : executorTesterList) { + executorTester.execute(); + executorTester.validateResults(); + } + } + + public abstract void tearDown(); + + public String getFileContentAsStr(String filePath) throws IOException { + + StringBuilder fileContent = new StringBuilder(); + + try ( + InputStream ioStream = this.getClass().getClassLoader().getResourceAsStream(filePath); + InputStreamReader streamReader = new InputStreamReader(ioStream); + BufferedReader reader = new BufferedReader(streamReader); + ) + { + for (String line; 
(line = reader.readLine()) != null;) { + fileContent.append(line); + fileContent.append("\n"); + } + } + return fileContent.toString(); + } + + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java new file mode 100644 index 000000000..76d15e708 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java @@ -0,0 +1,19 @@ +package zingg.common.core.executor; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class TrainerTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(TrainerTester.class); + + public TrainerTester(Trainer executor) { + super(executor); + } + + @Override + public void validateResults() { + LOG.info("train successful"); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java new file mode 100644 index 000000000..945be8ed0 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java @@ -0,0 +1,29 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; + +public class TrainingDataFinderTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(TrainingDataFinderTester.class); + + public TrainingDataFinderTester(TrainingDataFinder executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + // check that unmarked data has at least 10 rows + ZFrame df = executor.getContext().getPipeUtil().read(false, false, executor.getContext().getPipeUtil().getTrainingDataUnmarkedPipe(executor.getArgs())); + + long trainingDataCount = df.count(); + assertTrue(trainingDataCount > 10); + LOG.info("trainingDataCount : "+ trainingDataCount); + } + +} diff --git a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java b/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java index a3e04b4b0..a519cfe1f 100644 --- a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java +++ b/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java @@ -1,4 +1,4 @@ -package zingg.common.infraForTest.util; +package zingg.common.infra.util; import java.lang.reflect.*; import java.security.NoSuchAlgorithmException; From 91c762e32b796bb3508faf2952a63e630679f297 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 2 Feb 2024 14:15:44 +0530 Subject: [PATCH 049/219] TestSparkExecutors integrated --- .../common/core/executor/MatcherTester.java | 12 +- .../core/executor/TestExecutorsGeneric.java | 33 +---- .../core/executor/JunitSparkLabeller.java | 44 +++++++ .../core/executor/SparkMatcherTester.java | 36 +++++ .../core/executor/TestSparkExecutors.java | 124 ++++++++++++++++++ .../core/executor/configSparkIntTest.json | 95 ++++++++++++++ .../zingg/spark/core/executor/test.csv | 63 +++++++++ 7 files changed, 370 insertions(+), 37 deletions(-) create mode 100644 spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java create mode 100644 spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java create mode 100644 
spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java create mode 100644 spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json create mode 100644 spark/core/src/test/resources/zingg/spark/core/executor/test.csv diff --git a/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java index 7ee3eda04..08485227a 100644 --- a/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java +++ b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -31,7 +31,7 @@ protected void assessAccuracy() throws ZinggClientException { df = df.withColumn("fnameId",df.concat(df.col("fname"), df.col("id"))); df = df.select("fnameId", getClusterColName()); - df = df.withColumn("dupeFnameId",substr(df,df.col("fnameId"),0,8)).cache(); + df = df.withColumn("dupeFnameId",substr(df.col("fnameId"),0,8)).cache(); ZFrame df1 = df.withColumnRenamed("fnameId", "fnameId1").withColumnRenamed("dupeFnameId", "dupeFnameId1") .withColumnRenamed(getClusterColName(), getClusterColName() + "1").cache(); @@ -66,17 +66,17 @@ protected ZFrame joinAndFilter(String colName, ZFrame df, ZFra C col1 = df.col(colName); C col2 = df1.col(colName+"1"); ZFrame joined = df.joinOnCol(df1, df.equalTo(col1, col2)); - return joined.filter(gt(joined, joined.col("fnameId"), joined.col("fnameId1"))); + return joined.filter(gt(joined.col("fnameId"), joined.col("fnameId1"))); } // returns df1.intersect(df2) public abstract ZFrame intersect(ZFrame df1, ZFrame df2); - // return df1.substr(col,startPos,len) - public abstract C substr(ZFrame df1, C col, int startPos, int len); + // return col.substr(startPos,len) + public abstract C substr(C col, int startPos, int len); - // return df1.gt(c1, c2) - public abstract C gt(ZFrame df1, C column1, C column2); + // return c1.gt(c2) + public abstract C gt(C column1, C column2); } diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java index 7625e596e..aff380ab0 100644 --- a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java +++ b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -1,8 +1,5 @@ package zingg.common.core.executor; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.List; import org.apache.commons.logging.Log; @@ -21,8 +18,6 @@ public abstract class TestExecutorsGeneric { protected S session; - protected final String PARENT_CONFIG_FILE = "PARENT-CONFIG-FILE"; - public TestExecutorsGeneric() { } @@ -34,7 +29,7 @@ public TestExecutorsGeneric(S s) throws ZinggClientException, IOException { public void init(S s) throws ZinggClientException, IOException { this.session = s; // set up args - String configFile = setupArgs(); + setupArgs(); } public String setupArgs() throws ZinggClientException, IOException { @@ -47,11 +42,6 @@ public String setupArgs() throws ZinggClientException, IOException { public abstract String getConfigFile(); - public abstract String getConfigIncrFile(); - - public abstract String getConfigApproveFile(); - - public void testExecutors(List> executorTesterList) throws ZinggClientException { for (ExecutorTester executorTester : executorTesterList) { executorTester.execute(); @@ -59,25 +49,6 @@ public void testExecutors(List> executorTesterList } } - public abstract 
void tearDown(); - - public String getFileContentAsStr(String filePath) throws IOException { - - StringBuilder fileContent = new StringBuilder(); - - try ( - InputStream ioStream = this.getClass().getClassLoader().getResourceAsStream(filePath); - InputStreamReader streamReader = new InputStreamReader(ioStream); - BufferedReader reader = new BufferedReader(streamReader); - ) - { - for (String line; (line = reader.readLine()) != null;) { - fileContent.append(line); - fileContent.append("\n"); - } - } - return fileContent.toString(); - } - + public abstract void tearDown(); } diff --git a/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java new file mode 100644 index 000000000..ba1ed9372 --- /dev/null +++ b/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java @@ -0,0 +1,44 @@ +package zingg.spark.core.executor; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; + +import zingg.common.client.IArguments; +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.options.ZinggOptions; +import zingg.common.core.executor.JunitLabeller; +import zingg.spark.core.context.ZinggSparkContext; + +public class JunitSparkLabeller extends SparkLabeller { + + private static final long serialVersionUID = 1L; + + JunitLabeller,Row,Column,DataType> junitLabeller; + + public JunitSparkLabeller() { + this(new ZinggSparkContext()); + } + + public JunitSparkLabeller(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.LABEL); + setContext(sparkContext); + junitLabeller = new JunitLabeller,Row,Column,DataType>(sparkContext); + } + + @Override + public void setArgs(IArguments args) { + super.setArgs(args); + junitLabeller.setArgs(args); + } + + @Override + public ZFrame,Row,Column> processRecordsCli(ZFrame,Row,Column> lines) + throws ZinggClientException { + return junitLabeller.processRecordsCli(lines); + } +} + diff --git a/spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java b/spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java new file mode 100644 index 000000000..d94691e21 --- /dev/null +++ b/spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java @@ -0,0 +1,36 @@ +package zingg.spark.core.executor; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; + +import zingg.common.client.ZFrame; +import zingg.common.core.executor.Matcher; +import zingg.common.core.executor.MatcherTester; +import zingg.spark.client.SparkFrame; + +public class SparkMatcherTester extends MatcherTester,Row,Column,DataType> { + + public SparkMatcherTester(Matcher, Row, Column, DataType> executor) { + super(executor); + } + + @Override + public ZFrame, Row, Column> intersect(ZFrame, Row, Column> df1, + ZFrame, Row, Column> df2) { + return new SparkFrame(df1.df().intersect(df2.df())); + } + + @Override + public Column substr(Column col, int startPos, int len) { + return col.substr(startPos,len); + } + + @Override + public Column gt(Column column1, Column column2) { + return column1.gt(column2); + } + +} diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java 
b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java new file mode 100644 index 000000000..4995e51ed --- /dev/null +++ b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -0,0 +1,124 @@ +package zingg.spark.core.executor; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import zingg.common.client.ZinggClientException; +import zingg.common.core.executor.ExecutorTester; +import zingg.common.core.executor.Labeller; +import zingg.common.core.executor.LabellerTester; +import zingg.common.core.executor.MatcherTester; +import zingg.common.core.executor.TestExecutorsGeneric; +import zingg.common.core.executor.TrainerTester; +import zingg.common.core.executor.TrainingDataFinderTester; +import zingg.spark.core.context.ZinggSparkContext; + +public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { + protected static final String CONFIG_FILE = "zingg/spark/core/executor/configSparkIntTest.json"; + + protected static final String TEST_DATA_FILE = "zingg/spark/core/executor/test.csv"; + + public static final Log LOG = LogFactory.getLog(TestSparkExecutors.class); + + protected ZinggSparkContext ctx; + + + public TestSparkExecutors() throws IOException, ZinggClientException { + SparkSession spark = SparkSession + .builder() + .master("local[*]") + .appName("Zingg" + "Junit") + .getOrCreate(); + this.ctx = new ZinggSparkContext(); + this.ctx.setSession(spark); + this.ctx.setUtils(); + init(spark); + } + + @Override + public String getConfigFile() { + return CONFIG_FILE; + } + + @Test + public void testExecutors() throws ZinggClientException { + List,Row,Column,DataType>> executorTesterList = new ArrayList,Row,Column,DataType>>(); + + TrainingDataFinderTester,Row,Column,DataType> tdft = new TrainingDataFinderTester,Row,Column,DataType>(getTrainingDataFinder()); + executorTesterList.add(tdft); + + LabellerTester,Row,Column,DataType> lt = new LabellerTester,Row,Column,DataType>(getLabeller()); + executorTesterList.add(lt); + + // training and labelling needed twice to get sufficient data + TrainingDataFinderTester,Row,Column,DataType> tdft2 = new TrainingDataFinderTester,Row,Column,DataType>(getTrainingDataFinder()); + executorTesterList.add(tdft2); + + LabellerTester,Row,Column,DataType> lt2 = new LabellerTester,Row,Column,DataType>(getLabeller()); + executorTesterList.add(lt2); + + TrainerTester,Row,Column,DataType> tt = new TrainerTester,Row,Column,DataType>(getTrainer()); + executorTesterList.add(tt); + + MatcherTester,Row,Column,DataType> mt = new SparkMatcherTester(getMatcher()); + executorTesterList.add(mt); + + super.testExecutors(executorTesterList); + } + + protected SparkTrainingDataFinder getTrainingDataFinder() throws ZinggClientException { + SparkTrainingDataFinder stdf = new SparkTrainingDataFinder(ctx); + stdf.init(args); + return stdf; + } + + protected Labeller,Row,Column,DataType> getLabeller() throws ZinggClientException { + JunitSparkLabeller jlbl = new JunitSparkLabeller(ctx); + jlbl.init(args); + return jlbl; + } + + protected SparkTrainer getTrainer() throws ZinggClientException { + 
SparkTrainer st = new SparkTrainer(ctx); + st.init(args); + return st; + } + + protected SparkMatcher getMatcher() throws ZinggClientException { + SparkMatcher sm = new SparkMatcher(ctx); + sm.init(args); + return sm; + } + + + @Override + public String setupArgs() throws ZinggClientException, IOException { + String configFile = super.setupArgs(); + String testFile = getClass().getClassLoader().getResource(TEST_DATA_FILE).getFile(); + // correct the location of test data + args.getData()[0].setProp("location", testFile); + return configFile; + } + + @Override + @AfterEach + public void tearDown() { + // just rename, would be removed automatically as it's in /tmp + File dir = new File(args.getZinggDir()); + File newDir = new File(dir.getParent() + "/zingg_junit_" + System.currentTimeMillis()); + dir.renameTo(newDir); + } + +} diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json b/spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json new file mode 100644 index 000000000..0ef68c004 --- /dev/null +++ b/spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json @@ -0,0 +1,95 @@ +{ + "fieldDefinition":[ + { + "fieldName" : "id", + "matchType" : "dont_use", + "fields" : "id", + "dataType": "string" + }, + { + "fieldName" : "fname", + "matchType" : "fuzzy", + "fields" : "fname", + "dataType": "string" + }, + { + "fieldName" : "lname", + "matchType" : "fuzzy", + "fields" : "lname", + "dataType": "string" + }, + { + "fieldName" : "stNo", + "matchType": "fuzzy", + "fields" : "stNo", + "dataType": "string" + }, + { + "fieldName" : "add1", + "matchType": "fuzzy", + "fields" : "add1", + "dataType": "string" + }, + { + "fieldName" : "add2", + "matchType": "fuzzy", + "fields" : "add2", + "dataType": "string" + }, + { + "fieldName" : "city", + "matchType": "fuzzy", + "fields" : "city", + "dataType": "string" + }, + { + "fieldName" : "areacode", + "matchType": "fuzzy", + "fields" : "areacode", + "dataType": "string" + }, + { + "fieldName" : "state", + "matchType": "fuzzy", + "fields" : "state", + "dataType": "string" + }, + { + "fieldName" : "dob", + "matchType": "fuzzy", + "fields" : "dob", + "dataType": "string" + }, + { + "fieldName" : "ssn", + "matchType": "fuzzy", + "fields" : "ssn", + "dataType": "string" + } + ], + "output" : [{ + "name":"output", + "format":"csv", + "props": { + "location": "/tmp/junit_integration_spark/zinggOutput", + "delimiter": ",", + "header":true + } + }], + "data" : [{ + "name":"test", + "format":"csv", + "props": { + "location": "test.csv", + "delimiter": ",", + "header":false + }, + "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string" + } + ], + "labelDataSampleSize" : 0.5, + "numPartitions":4, + "modelId": "junit_integration_spark", + "zinggDir": "/tmp/junit_integration_spark" + +} diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/test.csv b/spark/core/src/test/resources/zingg/spark/core/executor/test.csv new file mode 100644 index 000000000..4473ef4c2 --- /dev/null +++ b/spark/core/src/test/resources/zingg/spark/core/executor/test.csv @@ -0,0 +1,63 @@ +rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765 +rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534 +rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534 +rec-1022-dup-1, Érik, 
Guay,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837 +rec-1022-dup-2, Érik, Guay,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837 +rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837 +rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837 +rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837 +rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080 +rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362 +rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775 +rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057 +rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857 +rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057 +rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263 +rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716 +rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717 +rec-1028-org, , eglinton,24, currie crescent, woorinyan, riverwood,3749, qld,19180205,9341716 +rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085 +rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085 +rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085 +rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085 +rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085 +rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085 +rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808 +rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808 +rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186 +rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290 +rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304 +rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304 +rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066 +rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066 +rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462 +rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999 +rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462 +rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462 +rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-1, jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-2, jaiden, 
rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396 +rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 +rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 +rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651 +rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 +rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091 +rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924 +rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081 +rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081 +rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127 +rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127 +rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135 +rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135 +rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135 +rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135 +rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219 +rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219 +rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 +rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205 +rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 +rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789 +rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 +rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756 +rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756 From 722c43d177e97ff2823da716b813d9fb15679fe0 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Mon, 5 Feb 2024 12:30:30 +0530 Subject: [PATCH 050/219] javadoc support for generated python code --- .../py/processors/PythonClassProcessor.java | 7 + .../py/processors/PythonMethodProcessor.java | 8 ++ python/zingg/ArgumentsGenerated.py | 125 ++++++++++++++++++ python/zingg/FieldDefinitionGenerated.py | 13 ++ python/zingg/PipeGenerated.py | 5 + 5 files changed, 158 insertions(+) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index e15c0ffee..b9ddb3721 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -79,6 +79,13 @@ private void generateImportsAndDeclarations(Element element, FileWriter fileWrit fileWriter.write("JStructType = 
getJVM().org.apache.spark.sql.types.StructType\n"); fileWriter.write("\n"); } + + String javadoc = processingEnv.getElementUtils().getDocComment(element); + if (javadoc != null) { + fileWriter.write("'''\n"); + fileWriter.write(javadoc.trim()); + fileWriter.write("\n'''\n"); + } } private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException { diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 6de712703..1971adcb2 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -36,6 +36,14 @@ public boolean process(Set annotations, RoundEnvironment if (methodNames.contains(methodElement.getSimpleName().toString())) { try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { + + String javadoc = processingEnv.getElementUtils().getDocComment(methodElement); + if (javadoc != null) { + fileWriter.write(" '''\n"); + fileWriter.write(javadoc.trim()); + fileWriter.write("\n '''\n"); + } + fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n"); generateMethodReturn(methodElement, fileWriter); generateFieldAssignment(methodElement, fileWriter); diff --git a/python/zingg/ArgumentsGenerated.py b/python/zingg/ArgumentsGenerated.py index e03fc961c..bafb8d96d 100644 --- a/python/zingg/ArgumentsGenerated.py +++ b/python/zingg/ArgumentsGenerated.py @@ -1,4 +1,64 @@ from zingg.otherThanGenerated import * +''' +This class helps supply match arguments to Zingg. There are 3 basic steps + in any match process. +
+
+ • Defining - specifying information about data location, fields and our
+ notion of similarity.
+ • Training - making Zingg learn the matching rules
+ • Matching - Running the models on the entire dataset
+
+ There is another step, creating labeled data, which can be used to create
+ training data if none is present. Let us cover them in greater detail through
+ an example.
+
+ We have some positive and negative labeled examples from which we want
+ Zingg to learn. These are saved in
+
+ /path/to/training/data/positive.csv and
+
+ /path/to/training/data/negative.csv
+
+ Our actual data has colA,colB,colC,colD,colE with comma as the delimiter and
+ is saved at
+
+ /path/to/match/data.csv.
+
+ We want to match on colB and colD only, one of which is String and the other
+ is int.
+
+ Our program would look like
+
+ {@code
+ 	Arguments args = new Arguments();
+ 	args.setDelimiter(",");
+ 	args.setPositiveTrainingSamples("/path/to/training/data/positive.csv");
+ 	args.setNegativeTrainingSamples("/path/to/training/data/negative.csv");
+
+ 	FieldDefinition colB = new FieldDefinition(1, FieldClass.STRING,
+ 			FieldType.WORD);
+ 	FieldDefinition colD = new FieldDefinition(3, FieldClass.INTEGER,
+ 			FieldType.NUMERIC);
+
+ 	List<FieldDefinition> fields = new ArrayList<FieldDefinition>();
+ 	fields.add(colB);
+ 	fields.add(colD);
+ 	args.setFieldDefinition(fields);
+
+ 	args.setMatchData("/path/to/match/data.csv");
+
+ 	args.setZinggDir("/path/to/models");
+ 	args.setOutputDir("/path/to/match/output");
+
+ 	Client client = new Client(args, "local");
+ 	client.train();
+ 	client.run();
+ }
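+
+ A rough Python sketch of the same flow, using the wrappers generated in
+ this patch series, might look like the following (illustrative only: the
+ CsvPipe and FieldDefinition helpers come from the companion modules shown
+ earlier, and the MatchType constants are assumed to mirror the Java client):
+
+ 	args = Arguments()
+ 	colB = FieldDefinition("colB", "string", MatchType.FUZZY)
+ 	colD = FieldDefinition("colD", "integer", MatchType.NUMERIC)
+ 	args.setFieldDefinition([colB, colD])
+ 	args.setData(CsvPipe("in", "/path/to/match/data.csv"))
+ 	args.setOutput(CsvPipe("out", "/path/to/match/output"))
+ 	args.setZinggDir("/path/to/models")
+ 	args.setModelId("100")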
+ 
+''' class Arguments: def __init__(self): self.arguments = getJVM().zingg.common.client.Arguments() @@ -6,33 +66,98 @@ def __init__(self): def setNumPartitions(self, numPartitions): self.arguments.setNumPartitions(numPartitions) + ''' +Set the fraction of data to be used from complete data set to be used for + seeding the labelled data Labelling is costly and we want a fast + approximate way of looking at a small sample of the records and + identifying expected matches and non matches + + @param labelDataSampleSize + - float between 0 and 1 denoting portion of dataset to use in + generating seed samples + @throws ZinggClientException + ''' def setLabelDataSampleSize(self, labelDataSampleSize): self.arguments.setLabelDataSampleSize(labelDataSampleSize) + ''' +Location for internal Zingg use. + + @return the path for internal Zingg usage + + public Pipe[] getZinggInternal() { + return zinggInternal; + } + + /** + Set the location for Zingg to save its internal computations and + models. Please set it to a place where the program has write access. + + @param zinggDir + path to the Zingg directory + + public void setZinggInternal(Pipe[] zinggDir) { + this.zinggInternal = zinggDir; + } + ''' def getModelId(self): return self.arguments.getModelId() def setModelId(self, modelId): self.arguments.setModelId(modelId) + ''' +Set the output directory where the match result will be saved + + @param outputDir + where the match result is saved + @throws ZinggClientException + ''' def setOutput(self, outputDir): self.arguments.setOutput(outputDir) + ''' +Set the location for Zingg to save its internal computations and + models. Please set it to a place where the program has write access. + + @param zinggDir + path to the Zingg directory + ''' def setZinggDir(self, zinggDir): self.arguments.setZinggDir(zinggDir) + ''' +Location for internal Zingg use. + + @return the path for internal Zingg usage + ''' def getZinggBaseModelDir(self): return self.arguments.getZinggBaseModelDir() def getZinggModelDir(self): return self.arguments.getZinggModelDir() + ''' +Location for internal Zingg use. + + @return the path for internal Zingg usage + ''' def getZinggBaseTrainingDataDir(self): return self.arguments.getZinggBaseTrainingDataDir() + ''' +Location for internal Zingg use. + + @return the path for internal Zingg usage + ''' def getZinggTrainingDataUnmarkedDir(self): return self.arguments.getZinggTrainingDataUnmarkedDir() + ''' +Location for internal Zingg use. + + @return the path for internal Zingg usage + ''' def getZinggTrainingDataMarkedDir(self): return self.arguments.getZinggTrainingDataMarkedDir() diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py index 4713552bd..b08d75984 100644 --- a/python/zingg/FieldDefinitionGenerated.py +++ b/python/zingg/FieldDefinitionGenerated.py @@ -1,4 +1,10 @@ from zingg.otherThanGenerated import * +''' +This class defines each field that we use in matching We can use this to + configure the properties of each field we use for matching in Zingg. 
+ + @author sgoyal +''' class FieldDefinition: def __init__(self, name, dataType, *matchType): self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() @@ -13,6 +19,13 @@ def getFieldDefinition(self): def setFields(self, fields): self.fielddefinition.setFields(fields) + ''' +Set the field type which defines the kind of matching we want to do + + @see MatchType + @param type + the type to set + ''' def setMatchType(self, type): self.fielddefinition.setMatchType(type) diff --git a/python/zingg/PipeGenerated.py b/python/zingg/PipeGenerated.py index 6144a7177..326404a11 100644 --- a/python/zingg/PipeGenerated.py +++ b/python/zingg/PipeGenerated.py @@ -6,6 +6,11 @@ FilePipe = getJVM().zingg.common.client.pipe.FilePipe JStructType = getJVM().org.apache.spark.sql.types.StructType +''' +Actual pipe def in the args. One pipe can be used at multiple places with different tables, locations, queries etc + + @author sgoyal +''' class Pipe: def __init__(self, name, format): self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe() From 44e1b1abe8ed9a957f6386bb4a524dbe9e026ca5 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 7 Feb 2024 15:25:24 +0530 Subject: [PATCH 051/219] refactor integrations tests for reuse --- common/common-test/pom.xml | 23 +++++++++++++ .../common/core/executor/ExecutorTester.java | 0 .../common/core/executor/JunitLabeller.java | 0 .../common/core/executor/LabellerTester.java | 0 .../common/core/executor/MatcherTester.java | 0 .../core/executor/TestExecutorsGeneric.java | 0 .../common/core/executor/TrainerTester.java | 0 .../executor/TrainingDataFinderTester.java | 0 spark/spark-test/pom.xml | 33 +++++++++++++++++++ .../core/executor/JunitSparkLabeller.java | 0 .../core/executor/SparkMatcherTester.java | 0 .../core/executor/TestSparkExecutors.java | 0 .../core/executor/configSparkIntTest.json | 0 .../zingg/spark/core/executor/test.csv | 0 14 files changed, 56 insertions(+) create mode 100644 common/common-test/pom.xml rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/ExecutorTester.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/JunitLabeller.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/LabellerTester.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/MatcherTester.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/TestExecutorsGeneric.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/TrainerTester.java (100%) rename common/{core/src/test => common-test/src/main}/java/zingg/common/core/executor/TrainingDataFinderTester.java (100%) create mode 100644 spark/spark-test/pom.xml rename spark/{core/src/test => spark-test/src/main}/java/zingg/spark/core/executor/JunitSparkLabeller.java (100%) rename spark/{core/src/test => spark-test/src/main}/java/zingg/spark/core/executor/SparkMatcherTester.java (100%) rename spark/{core/src/test => spark-test/src/main}/java/zingg/spark/core/executor/TestSparkExecutors.java (100%) rename spark/{core/src/test => spark-test/src/main}/resources/zingg/spark/core/executor/configSparkIntTest.json (100%) rename spark/{core/src/test => spark-test/src/main}/resources/zingg/spark/core/executor/test.csv (100%) diff --git a/common/common-test/pom.xml b/common/common-test/pom.xml new file mode 100644 index 000000000..23550a3ea --- /dev/null +++ 
b/common/common-test/pom.xml @@ -0,0 +1,23 @@ + + 4.0.0 + + zingg + zingg-common + ${zingg.version} + + zingg-common-test + jar + + + zingg + zingg-common-core + ${zingg.version} + + + zingg + zingg-common-client + ${zingg.version} + + + diff --git a/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java b/common/common-test/src/main/java/zingg/common/core/executor/ExecutorTester.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java rename to common/common-test/src/main/java/zingg/common/core/executor/ExecutorTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java b/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java rename to common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java diff --git a/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java b/common/common-test/src/main/java/zingg/common/core/executor/LabellerTester.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/LabellerTester.java rename to common/common-test/src/main/java/zingg/common/core/executor/LabellerTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/main/java/zingg/common/core/executor/MatcherTester.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/MatcherTester.java rename to common/common-test/src/main/java/zingg/common/core/executor/MatcherTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/common-test/src/main/java/zingg/common/core/executor/TestExecutorsGeneric.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java rename to common/common-test/src/main/java/zingg/common/core/executor/TestExecutorsGeneric.java diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java b/common/common-test/src/main/java/zingg/common/core/executor/TrainerTester.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/TrainerTester.java rename to common/common-test/src/main/java/zingg/common/core/executor/TrainerTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java b/common/common-test/src/main/java/zingg/common/core/executor/TrainingDataFinderTester.java similarity index 100% rename from common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java rename to common/common-test/src/main/java/zingg/common/core/executor/TrainingDataFinderTester.java diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml new file mode 100644 index 000000000..57c9c4333 --- /dev/null +++ b/spark/spark-test/pom.xml @@ -0,0 +1,33 @@ + + 4.0.0 + + zingg + zingg-spark + ${zingg.version} + + zingg-spark-test + jar + + + zingg + zingg-spark-core + ${zingg.version} + + + zingg + zingg-spark-client + ${zingg.version} + + + zingg + zingg-common-core + ${zingg.version} + + + zingg + zingg-common-client + ${zingg.version} + + + diff --git a/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/spark-test/src/main/java/zingg/spark/core/executor/JunitSparkLabeller.java similarity index 100% rename from 
spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java rename to spark/spark-test/src/main/java/zingg/spark/core/executor/JunitSparkLabeller.java diff --git a/spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java b/spark/spark-test/src/main/java/zingg/spark/core/executor/SparkMatcherTester.java similarity index 100% rename from spark/core/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java rename to spark/spark-test/src/main/java/zingg/spark/core/executor/SparkMatcherTester.java diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/spark-test/src/main/java/zingg/spark/core/executor/TestSparkExecutors.java similarity index 100% rename from spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java rename to spark/spark-test/src/main/java/zingg/spark/core/executor/TestSparkExecutors.java diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json b/spark/spark-test/src/main/resources/zingg/spark/core/executor/configSparkIntTest.json similarity index 100% rename from spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json rename to spark/spark-test/src/main/resources/zingg/spark/core/executor/configSparkIntTest.json diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/test.csv b/spark/spark-test/src/main/resources/zingg/spark/core/executor/test.csv similarity index 100% rename from spark/core/src/test/resources/zingg/spark/core/executor/test.csv rename to spark/spark-test/src/main/resources/zingg/spark/core/executor/test.csv From 8a48c061364db9f6f560609956ddcd377ef85930 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 7 Feb 2024 15:27:09 +0530 Subject: [PATCH 052/219] test modules in build xml --- .../main/java/zingg/common/core/executor/JunitLabeller.java | 1 - common/pom.xml | 1 + spark/pom.xml | 1 + spark/spark-test/pom.xml | 5 +++++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java b/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java index 8b4a66874..ecdba92f4 100644 --- a/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java +++ b/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java @@ -6,7 +6,6 @@ import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import zingg.common.core.context.Context; -import zingg.common.core.executor.Labeller; public class JunitLabeller extends Labeller { diff --git a/common/pom.xml b/common/pom.xml index 23dd19064..cce9085d8 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -12,5 +12,6 @@ infra core client + common-test diff --git a/spark/pom.xml b/spark/pom.xml index 2ea784073..3f7444b54 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -11,6 +11,7 @@ core client + spark-test diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index 57c9c4333..cd34eb877 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -29,5 +29,10 @@ zingg-common-client ${zingg.version} + + zingg + zingg-common-test + ${zingg.version} + From b33ad63a9d544ab223f7a64ebb72b6f790399b2b Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 7 Feb 2024 15:53:46 +0530 Subject: [PATCH 053/219] junit dependency --- common/common-test/pom.xml | 15 +++++++++++++++ spark/spark-test/pom.xml | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/common/common-test/pom.xml 
b/common/common-test/pom.xml index 23550a3ea..082f73e33 100644 --- a/common/common-test/pom.xml +++ b/common/common-test/pom.xml @@ -19,5 +19,20 @@ zingg-common-client ${zingg.version}
+ + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 +
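Pinning JUnit Jupiter 5.8.1 here lets the relocated tester classes compile on their own; to let other modules reuse those classes, later commits in this series also publish them as a test-jar through maven-jar-plugin. For reference, a consuming module would declare the shared artifact roughly as below; the coordinates come from the spark-test pom changes later in this series, and the element layout is a reconstructed sketch rather than a verbatim quote of any diff.

<dependency>
  <groupId>zingg</groupId>
  <artifactId>zingg-common-test</artifactId>
  <version>${zingg.version}</version>
  <classifier>tests</classifier>
  <type>test-jar</type>
  <scope>test</scope>
</dependency>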
diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index cd34eb877..b952e608c 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -34,5 +34,20 @@ zingg-common-test ${zingg.version} + + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 + From 3caedc58acd8a810cc34e8204d049b9c7f33f0fd Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 7 Feb 2024 17:00:34 +0530 Subject: [PATCH 054/219] Annotation changes --- .../py/processors/PythonClassProcessor.java | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index b9ddb3721..17bd7bdba 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -36,9 +36,12 @@ public boolean process(Set annotations, RoundEnvironment if (element.getKind() == ElementKind.CLASS) { TypeElement classElement = (TypeElement) element; PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); + String packageName = packageElement.getQualifiedName().toString(); List methodNames = new ArrayList<>(); + + String outputDirectory = determineOutputDirectory(packageName); - try (FileWriter fileWriter = new FileWriter("python/zingg"+ File.separator + element.getSimpleName() + "Generated.py")) { + try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) { generateImportsAndDeclarations(element, fileWriter); fileWriter.write("class " + element.getSimpleName() + ":\n"); @@ -68,6 +71,18 @@ Map> getClassMethodsMap() { return classMethodsMap; } + private String determineOutputDirectory(String packageName) { + if (packageName.contains("enterprise") && packageName.contains("common")) { + return "common/python"; + } else if (packageName.contains("enterprise") && packageName.contains("snowflake")) { + return "snowflake/python"; + } else if (packageName.contains("enterprise") && packageName.contains("spark")) { + return "spark/python"; + } else { + return "python/zingg"; + } + } + private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException { fileWriter.write("from zingg.otherThanGenerated import *\n"); if (element.getSimpleName().contentEquals("Pipe")) { @@ -94,9 +109,16 @@ private void generateClassInitializationCode(TypeElement classElement, Element e fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setName(name)\n"); fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFormat(format)\n"); } + else if (element.getSimpleName().contentEquals("EPipe")) { + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n"); + fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setPassthroughExpr(passthroughExpr)\n"); + } else if (element.getSimpleName().contentEquals("Arguments")) { fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n"); } + else if (element.getSimpleName().contentEquals("EArguments")) { + fileWriter.write(" self." 
+ element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n"); + } else if (element.getSimpleName().contentEquals("FieldDefinition")) { fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.FieldDefinition()\n"); fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFieldName(name)\n"); From f0dbcf3781d275d0a6d0480384015c1e8fe5173d Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 8 Feb 2024 13:53:18 +0530 Subject: [PATCH 055/219] moved to test folder --- .../java/zingg/spark/core/executor/JunitSparkLabeller.java | 0 .../java/zingg/spark/core/executor/SparkMatcherTester.java | 0 .../java/zingg/spark/core/executor/TestSparkExecutors.java | 0 .../resources/zingg/spark/core/executor/configSparkIntTest.json | 0 .../{main => test}/resources/zingg/spark/core/executor/test.csv | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename spark/spark-test/src/{main => test}/java/zingg/spark/core/executor/JunitSparkLabeller.java (100%) rename spark/spark-test/src/{main => test}/java/zingg/spark/core/executor/SparkMatcherTester.java (100%) rename spark/spark-test/src/{main => test}/java/zingg/spark/core/executor/TestSparkExecutors.java (100%) rename spark/spark-test/src/{main => test}/resources/zingg/spark/core/executor/configSparkIntTest.json (100%) rename spark/spark-test/src/{main => test}/resources/zingg/spark/core/executor/test.csv (100%) diff --git a/spark/spark-test/src/main/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java similarity index 100% rename from spark/spark-test/src/main/java/zingg/spark/core/executor/JunitSparkLabeller.java rename to spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java diff --git a/spark/spark-test/src/main/java/zingg/spark/core/executor/SparkMatcherTester.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java similarity index 100% rename from spark/spark-test/src/main/java/zingg/spark/core/executor/SparkMatcherTester.java rename to spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java diff --git a/spark/spark-test/src/main/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java similarity index 100% rename from spark/spark-test/src/main/java/zingg/spark/core/executor/TestSparkExecutors.java rename to spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java diff --git a/spark/spark-test/src/main/resources/zingg/spark/core/executor/configSparkIntTest.json b/spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json similarity index 100% rename from spark/spark-test/src/main/resources/zingg/spark/core/executor/configSparkIntTest.json rename to spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json diff --git a/spark/spark-test/src/main/resources/zingg/spark/core/executor/test.csv b/spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv similarity index 100% rename from spark/spark-test/src/main/resources/zingg/spark/core/executor/test.csv rename to spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv From 7837c3c2e7c31980410c3027c2519c6fd3162e21 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 8 Feb 2024 14:23:37 +0530 Subject: [PATCH 056/219] move to test and 
generate test jars --- common/common-test/pom.xml | 3 +++ .../zingg/common/core/executor/ExecutorTester.java | 0 .../zingg/common/core/executor/JunitLabeller.java | 0 .../zingg/common/core/executor/LabellerTester.java | 0 .../zingg/common/core/executor/MatcherTester.java | 0 .../common/core/executor/TestExecutorsGeneric.java | 0 .../zingg/common/core/executor/TrainerTester.java | 0 .../core/executor/TrainingDataFinderTester.java | 0 pom.xml | 12 ++++++++++++ spark/spark-test/pom.xml | 6 ++++++ 10 files changed, 21 insertions(+) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/ExecutorTester.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/JunitLabeller.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/LabellerTester.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/MatcherTester.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/TestExecutorsGeneric.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/TrainerTester.java (100%) rename common/common-test/src/{main => test}/java/zingg/common/core/executor/TrainingDataFinderTester.java (100%) diff --git a/common/common-test/pom.xml b/common/common-test/pom.xml index 082f73e33..0daf23a58 100644 --- a/common/common-test/pom.xml +++ b/common/common-test/pom.xml @@ -23,16 +23,19 @@ org.junit.jupiter junit-jupiter-engine 5.8.1 + test org.junit.jupiter junit-jupiter-api 5.8.1 + test org.junit.jupiter junit-jupiter-params 5.8.1 + test diff --git a/common/common-test/src/main/java/zingg/common/core/executor/ExecutorTester.java b/common/common-test/src/test/java/zingg/common/core/executor/ExecutorTester.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/ExecutorTester.java rename to common/common-test/src/test/java/zingg/common/core/executor/ExecutorTester.java diff --git a/common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java b/common/common-test/src/test/java/zingg/common/core/executor/JunitLabeller.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/JunitLabeller.java rename to common/common-test/src/test/java/zingg/common/core/executor/JunitLabeller.java diff --git a/common/common-test/src/main/java/zingg/common/core/executor/LabellerTester.java b/common/common-test/src/test/java/zingg/common/core/executor/LabellerTester.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/LabellerTester.java rename to common/common-test/src/test/java/zingg/common/core/executor/LabellerTester.java diff --git a/common/common-test/src/main/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/MatcherTester.java rename to common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java diff --git a/common/common-test/src/main/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/TestExecutorsGeneric.java rename to common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java diff --git 
a/common/common-test/src/main/java/zingg/common/core/executor/TrainerTester.java b/common/common-test/src/test/java/zingg/common/core/executor/TrainerTester.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/TrainerTester.java rename to common/common-test/src/test/java/zingg/common/core/executor/TrainerTester.java diff --git a/common/common-test/src/main/java/zingg/common/core/executor/TrainingDataFinderTester.java b/common/common-test/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java similarity index 100% rename from common/common-test/src/main/java/zingg/common/core/executor/TrainingDataFinderTester.java rename to common/common-test/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java diff --git a/pom.xml b/pom.xml index 955a17ffd..db0cb1e34 100644 --- a/pom.xml +++ b/pom.xml @@ -247,6 +247,18 @@
+ + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index b952e608c..416f17e87 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -32,22 +32,28 @@ zingg zingg-common-test + tests + test-jar ${zingg.version} + test org.junit.jupiter junit-jupiter-engine 5.8.1 + test org.junit.jupiter junit-jupiter-api 5.8.1 + test org.junit.jupiter junit-jupiter-params 5.8.1 + test From 56c47e68c4a641ae2fe90004c4cce0ce596ae7e7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 8 Feb 2024 14:34:43 +0530 Subject: [PATCH 057/219] reduce scope of test jar production --- common/common-test/pom.xml | 16 ++++++++++++++++ pom.xml | 15 +-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/common/common-test/pom.xml b/common/common-test/pom.xml index 0daf23a58..1c1ee7a0d 100644 --- a/common/common-test/pom.xml +++ b/common/common-test/pom.xml @@ -38,4 +38,20 @@ test + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + + + diff --git a/pom.xml b/pom.xml index db0cb1e34..cd421cc5f 100644 --- a/pom.xml +++ b/pom.xml @@ -247,19 +247,6 @@ - - org.apache.maven.plugins - maven-jar-plugin - 2.3.2 - - - - test-jar - - - - - - + From e0b9453c911ce07d34e7473c9fc54cf77390fa7e Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 8 Feb 2024 15:15:11 +0530 Subject: [PATCH 058/219] generate test jar for spark test --- spark/spark-test/pom.xml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index 416f17e87..f612edfc7 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -56,4 +56,20 @@ test + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + + + From 75a3334fac16ef4e59d763ddce133b57dda69262 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Sat, 10 Feb 2024 23:43:03 +0530 Subject: [PATCH 059/219] Add NPE message --- .../core/src/main/java/zingg/common/core/util/DSUtil.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/util/DSUtil.java b/common/core/src/main/java/zingg/common/core/util/DSUtil.java index 6c6d0721b..15c4e2091 100644 --- a/common/core/src/main/java/zingg/common/core/util/DSUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/DSUtil.java @@ -43,7 +43,12 @@ public static final String[] getPrefixedColumns(String[] cols) { } public ZFrame getPrefixedColumnsDS(ZFrame lines) { - return lines.toDF(getPrefixedColumns(lines.columns())); + try { + return lines.toDF(getPrefixedColumns(lines.columns())); + } catch (Exception e) { + LOG.error("Please ensure that the 'ftd' and 'label' processes are executed before initiating the training phase"); + throw e; + } } From 20a8972e43df11756731bd756aba6b7828107d4d Mon Sep 17 00:00:00 2001 From: Gnanaprakash R <89972506+gnanaprakash-ravi@users.noreply.github.com> Date: Thu, 15 Feb 2024 19:02:25 +0530 Subject: [PATCH 060/219] Create ruleset.xml --- ruleset.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 ruleset.xml diff --git a/ruleset.xml b/ruleset.xml new file mode 100644 index 000000000..71d1534a3 --- /dev/null +++ b/ruleset.xml @@ -0,0 +1,10 @@ + + + lafaspot PMD rules. 
+ + 1 + + 1 + + + From 1dd92130be47b718ae83a49b44d4c23e6b46c9ee Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Fri, 16 Feb 2024 18:30:22 +0530 Subject: [PATCH 061/219] Event Publish subscribe framework --- .../main/java/zingg/common/client/Client.java | 16 +++++++++ .../client/event/events/DataCountEvent.java | 6 ++++ .../common/client/event/events/IEvent.java | 14 ++++++++ .../client/event/events/ZinggStartEvent.java | 5 +++ .../client/event/events/ZinggStopEvent.java | 5 +++ .../event/listeners/EventsListener.java | 34 +++++++++++++++++++ .../event/listeners/IEventListener.java | 10 ++++++ .../event/listeners/ZinggStartListener.java | 12 +++++++ .../event/listeners/ZinggStopListener.java | 11 ++++++ 9 files changed, 113 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/event/events/DataCountEvent.java create mode 100644 common/client/src/main/java/zingg/common/client/event/events/IEvent.java create mode 100644 common/client/src/main/java/zingg/common/client/event/events/ZinggStartEvent.java create mode 100644 common/client/src/main/java/zingg/common/client/event/events/ZinggStopEvent.java create mode 100644 common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java create mode 100644 common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java create mode 100644 common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java create mode 100644 common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index e505c4fc5..bd7e9abfe 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -5,6 +5,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import zingg.common.client.event.events.IEvent; +import zingg.common.client.event.events.ZinggStartEvent; +import zingg.common.client.event.events.ZinggStopEvent; +import zingg.common.client.event.listeners.EventsListener; +import zingg.common.client.event.listeners.IEventListener; +import zingg.common.client.event.listeners.ZinggStartListener; +import zingg.common.client.event.listeners.ZinggStopListener; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.Email; import zingg.common.client.util.EmailBody; @@ -323,5 +330,14 @@ protected ArgumentsUtil getArgsUtil() { } return argsUtil; } + + public void addListener(IEvent event, IEventListener listener) { + EventsListener.getInstance().addListener(event.getClass(), listener); + } + + public void initializeListeners() { + addListener(new ZinggStartEvent(), new ZinggStartListener()); + addListener(new ZinggStopEvent(), new ZinggStopListener()); + } } \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/event/events/DataCountEvent.java b/common/client/src/main/java/zingg/common/client/event/events/DataCountEvent.java new file mode 100644 index 000000000..667364863 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/events/DataCountEvent.java @@ -0,0 +1,6 @@ +package zingg.common.client.event.events; + +public class DataCountEvent extends IEvent{ + + public static final String INPUT_DATA_COUNT = "INPUT_DATA_COUNT"; +} diff --git a/common/client/src/main/java/zingg/common/client/event/events/IEvent.java 
b/common/client/src/main/java/zingg/common/client/event/events/IEvent.java new file mode 100644 index 000000000..53f443972 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/events/IEvent.java @@ -0,0 +1,14 @@ +package zingg.common.client.event.events; + +import java.util.HashMap; + +public class IEvent { + + public HashMap getProps(){ + return null; + } + + public void setProps(HashMap props){ + + } +} diff --git a/common/client/src/main/java/zingg/common/client/event/events/ZinggStartEvent.java b/common/client/src/main/java/zingg/common/client/event/events/ZinggStartEvent.java new file mode 100644 index 000000000..40c15775f --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/events/ZinggStartEvent.java @@ -0,0 +1,5 @@ +package zingg.common.client.event.events; + +public class ZinggStartEvent extends IEvent{ + +} diff --git a/common/client/src/main/java/zingg/common/client/event/events/ZinggStopEvent.java b/common/client/src/main/java/zingg/common/client/event/events/ZinggStopEvent.java new file mode 100644 index 000000000..dedeb37bd --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/events/ZinggStopEvent.java @@ -0,0 +1,5 @@ +package zingg.common.client.event.events; + +public class ZinggStopEvent extends IEvent{ + +} diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java new file mode 100644 index 000000000..8ef14f44f --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -0,0 +1,34 @@ +package zingg.common.client.event.listeners; + +import zingg.common.client.event.events.IEvent; +import zingg.common.client.util.ListMap; + +public class EventsListener { + private static EventsListener eventsListener = null; + private final ListMap, IEventListener> eventListeners; + + private EventsListener() { + eventListeners = new ListMap<>(); + } + + public static EventsListener getInstance() { + if (eventsListener == null) + eventsListener = new EventsListener(); + return eventsListener; + } + + public void addListener(Class eventClass, IEventListener listener) { + eventListeners.add(eventClass, listener); + } + + public void fireEvent(IEvent event) { + listen(event); + } + + private void listen(IEvent event) { + Class eventClass = event.getClass(); + for (IEventListener listener : eventListeners.get(eventClass)) { + listener.listen(event); + } + } +} diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java new file mode 100644 index 000000000..9764ff031 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java @@ -0,0 +1,10 @@ +package zingg.common.client.event.listeners; + +import zingg.common.client.event.events.IEvent; + +public class IEventListener { + + public void listen(IEvent event){ + + } +} diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java new file mode 100644 index 000000000..5cf9fc4cd --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java @@ -0,0 +1,12 @@ +package zingg.common.client.event.listeners; + +import zingg.common.client.event.events.IEvent; + +public class 
ZinggStartListener extends IEventListener { + + @Override + public void listen(IEvent event) { + System.out.println("ZinggStartListener: I am listening"); + } + +} diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java new file mode 100644 index 000000000..e5611fbd1 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java @@ -0,0 +1,11 @@ +package zingg.common.client.event.listeners; + +import zingg.common.client.event.events.IEvent; + +public class ZinggStopListener extends IEventListener { + + @Override + public void listen(IEvent event) { + System.out.println("ZinggStopListener: I am listening"); + } +} From 2c7f7447670a78d23d6f045ea92d558d2bc0479b Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 10:52:54 +0530 Subject: [PATCH 062/219] move intersect, substr and gt to ZFrame --- .../main/java/zingg/common/client/ZFrame.java | 7 ++++++- .../common/core/executor/MatcherTester.java | 16 +++------------- .../java/zingg/spark/client/SparkFrame.java | 13 +++++++++++++ .../core/executor/SparkMatcherTester.java | 18 ------------------ 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index d7c2c97ad..6ffa25a5a 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -170,5 +170,10 @@ public interface ZFrame { public ZFrame groupByCount(String groupByCol1, String groupByCol2, String countColName); - + public ZFrame intersect(ZFrame other); + + public C substr(C col, int startPos, int len); + + public C gt(C column1, C column2); + } diff --git a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java index 08485227a..b3dbc7b90 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -31,7 +31,7 @@ protected void assessAccuracy() throws ZinggClientException { df = df.withColumn("fnameId",df.concat(df.col("fname"), df.col("id"))); df = df.select("fnameId", getClusterColName()); - df = df.withColumn("dupeFnameId",substr(df.col("fnameId"),0,8)).cache(); + df = df.withColumn("dupeFnameId",df.substr(df.col("fnameId"),0,8)).cache(); ZFrame df1 = df.withColumnRenamed("fnameId", "fnameId1").withColumnRenamed("dupeFnameId", "dupeFnameId1") .withColumnRenamed(getClusterColName(), getClusterColName() + "1").cache(); @@ -40,7 +40,7 @@ protected void assessAccuracy() throws ZinggClientException { ZFrame result = joinAndFilter(getClusterColName(), df, df1).cache(); ZFrame fn = gold.except(result); - ZFrame tp = intersect(gold,result); + ZFrame tp = gold.intersect(result); ZFrame fp = result.except(gold); long fnCount = fn.count(); @@ -66,17 +66,7 @@ protected ZFrame joinAndFilter(String colName, ZFrame df, ZFra C col1 = df.col(colName); C col2 = df1.col(colName+"1"); ZFrame joined = df.joinOnCol(df1, df.equalTo(col1, col2)); - return joined.filter(gt(joined.col("fnameId"), joined.col("fnameId1"))); + return joined.filter(joined.gt(joined.col("fnameId"), joined.col("fnameId1"))); } - - // returns df1.intersect(df2) - public abstract ZFrame intersect(ZFrame df1, ZFrame df2); - 
- // return col.substr(startPos,len) - public abstract C substr(C col, int startPos, int len); - - // return c1.gt(c2) - public abstract C gt(C column1, C column2); - } diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index 6de857dcf..ce20ea9a5 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -448,7 +448,20 @@ public ZFrame, Row, Column> groupByCount(String groupByCol1, String } + @Override + public ZFrame, Row, Column> intersect(ZFrame, Row, Column> other) { + return new SparkFrame(df.intersect(other.df())); + } + @Override + public Column substr(Column col, int startPos, int len) { + return col.substr(startPos, len); + } + + @Override + public Column gt(Column column1, Column column2) { + return column1.gt(column2); + } } diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java index d94691e21..759442bca 100644 --- a/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java +++ b/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java @@ -6,10 +6,8 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; -import zingg.common.client.ZFrame; import zingg.common.core.executor.Matcher; import zingg.common.core.executor.MatcherTester; -import zingg.spark.client.SparkFrame; public class SparkMatcherTester extends MatcherTester,Row,Column,DataType> { @@ -17,20 +15,4 @@ public SparkMatcherTester(Matcher, Row, Column, DataT super(executor); } - @Override - public ZFrame, Row, Column> intersect(ZFrame, Row, Column> df1, - ZFrame, Row, Column> df2) { - return new SparkFrame(df1.df().intersect(df2.df())); - } - - @Override - public Column substr(Column col, int startPos, int len) { - return col.substr(startPos,len); - } - - @Override - public Column gt(Column column1, Column column2) { - return column1.gt(column2); - } - } From e18399e70ba3b962bbdc2ebd7900fc75e535bfaa Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 11:19:21 +0530 Subject: [PATCH 063/219] refactor testExecutors to base class --- .../common/core/executor/MatcherTester.java | 2 +- .../core/executor/TestExecutorsGeneric.java | 38 +++++++++++++++++ .../core/executor/SparkMatcherTester.java | 18 -------- .../core/executor/TestSparkExecutors.java | 41 ++----------------- 4 files changed, 43 insertions(+), 56 deletions(-) delete mode 100644 spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java index b3dbc7b90..00838ec68 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -9,7 +9,7 @@ import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; -public abstract class MatcherTester extends ExecutorTester { +public class MatcherTester extends ExecutorTester { public static final Log LOG = LogFactory.getLog(MatcherTester.class); diff --git a/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java 
b/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java index aff380ab0..8e7d9bdae 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -1,9 +1,11 @@ package zingg.common.core.executor; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.junit.jupiter.api.Test; import zingg.common.client.ArgumentsUtil; import zingg.common.client.IArguments; @@ -42,6 +44,34 @@ public String setupArgs() throws ZinggClientException, IOException { public abstract String getConfigFile(); + + @Test + public void testExecutors() throws ZinggClientException { + List> executorTesterList = new ArrayList>(); + + TrainingDataFinderTester tdft = new TrainingDataFinderTester(getTrainingDataFinder()); + executorTesterList.add(tdft); + + LabellerTester lt = new LabellerTester(getLabeller()); + executorTesterList.add(lt); + + // training and labelling needed twice to get sufficient data + TrainingDataFinderTester tdft2 = new TrainingDataFinderTester(getTrainingDataFinder()); + executorTesterList.add(tdft2); + + LabellerTester lt2 = new LabellerTester(getLabeller()); + executorTesterList.add(lt2); + + TrainerTester tt = new TrainerTester(getTrainer()); + executorTesterList.add(tt); + + MatcherTester mt = new MatcherTester(getMatcher()); + executorTesterList.add(mt); + + testExecutors(executorTesterList); + } + + public void testExecutors(List> executorTesterList) throws ZinggClientException { for (ExecutorTester executorTester : executorTesterList) { executorTester.execute(); @@ -51,4 +81,12 @@ public void testExecutors(List> executorTesterList public abstract void tearDown(); + protected abstract TrainingDataFinder getTrainingDataFinder() throws ZinggClientException; + + protected abstract Labeller getLabeller() throws ZinggClientException; + + protected abstract Trainer getTrainer() throws ZinggClientException; + + protected abstract Matcher getMatcher() throws ZinggClientException; + } diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java deleted file mode 100644 index 759442bca..000000000 --- a/spark/spark-test/src/test/java/zingg/spark/core/executor/SparkMatcherTester.java +++ /dev/null @@ -1,18 +0,0 @@ -package zingg.spark.core.executor; - -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataType; - -import zingg.common.core.executor.Matcher; -import zingg.common.core.executor.MatcherTester; - -public class SparkMatcherTester extends MatcherTester,Row,Column,DataType> { - - public SparkMatcherTester(Matcher, Row, Column, DataType> executor) { - super(executor); - } - -} diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java index 4995e51ed..dca91dae2 100644 --- a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java +++ b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -2,8 +2,6 @@ import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import 
org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -13,16 +11,10 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Test; import zingg.common.client.ZinggClientException; -import zingg.common.core.executor.ExecutorTester; import zingg.common.core.executor.Labeller; -import zingg.common.core.executor.LabellerTester; -import zingg.common.core.executor.MatcherTester; import zingg.common.core.executor.TestExecutorsGeneric; -import zingg.common.core.executor.TrainerTester; -import zingg.common.core.executor.TrainingDataFinderTester; import zingg.spark.core.context.ZinggSparkContext; public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { @@ -52,50 +44,25 @@ public String getConfigFile() { return CONFIG_FILE; } - @Test - public void testExecutors() throws ZinggClientException { - List,Row,Column,DataType>> executorTesterList = new ArrayList,Row,Column,DataType>>(); - - TrainingDataFinderTester,Row,Column,DataType> tdft = new TrainingDataFinderTester,Row,Column,DataType>(getTrainingDataFinder()); - executorTesterList.add(tdft); - - LabellerTester,Row,Column,DataType> lt = new LabellerTester,Row,Column,DataType>(getLabeller()); - executorTesterList.add(lt); - - // training and labelling needed twice to get sufficient data - TrainingDataFinderTester,Row,Column,DataType> tdft2 = new TrainingDataFinderTester,Row,Column,DataType>(getTrainingDataFinder()); - executorTesterList.add(tdft2); - - LabellerTester,Row,Column,DataType> lt2 = new LabellerTester,Row,Column,DataType>(getLabeller()); - executorTesterList.add(lt2); - - TrainerTester,Row,Column,DataType> tt = new TrainerTester,Row,Column,DataType>(getTrainer()); - executorTesterList.add(tt); - - MatcherTester,Row,Column,DataType> mt = new SparkMatcherTester(getMatcher()); - executorTesterList.add(mt); - - super.testExecutors(executorTesterList); - } - + @Override protected SparkTrainingDataFinder getTrainingDataFinder() throws ZinggClientException { SparkTrainingDataFinder stdf = new SparkTrainingDataFinder(ctx); stdf.init(args); return stdf; } - + @Override protected Labeller,Row,Column,DataType> getLabeller() throws ZinggClientException { JunitSparkLabeller jlbl = new JunitSparkLabeller(ctx); jlbl.init(args); return jlbl; } - + @Override protected SparkTrainer getTrainer() throws ZinggClientException { SparkTrainer st = new SparkTrainer(ctx); st.init(args); return st; } - + @Override protected SparkMatcher getMatcher() throws ZinggClientException { SparkMatcher sm = new SparkMatcher(ctx); sm.init(args); From 6423793d37b46a5e24ff5426df4315dae2189289 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 13:41:34 +0530 Subject: [PATCH 064/219] refactor init to base class --- .../common/core/executor/MatcherTester.java | 4 ++-- .../core/executor/TestExecutorsGeneric.java | 24 ++++++++++++++----- .../core/executor/TestSparkExecutors.java | 4 ---- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java index 00838ec68..4d4b7cfc6 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -53,8 +53,8 @@ protected void assessAccuracy() throws ZinggClientException { LOG.info("precision " + 
(tpCount*1.0d/(tpCount+fpCount))); LOG.info("recall " + tpCount + " denom " + (tpCount+fnCount) + " overall " + (tpCount*1.0d/(tpCount+fnCount))); - assertTrue(0.8 < (tpCount*1.0d/(tpCount+fpCount))); - assertTrue(0.8 < (tpCount*1.0d/(tpCount+fnCount))); + assertTrue(0.8 < Math.round(tpCount*1.0d/(tpCount+fpCount))); + assertTrue(0.8 < Math.round(tpCount*1.0d/(tpCount+fnCount))); } public ZFrame getOutputData() throws ZinggClientException { diff --git a/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java index 8e7d9bdae..424eec10c 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -49,23 +49,35 @@ public String setupArgs() throws ZinggClientException, IOException { public void testExecutors() throws ZinggClientException { List> executorTesterList = new ArrayList>(); - TrainingDataFinderTester tdft = new TrainingDataFinderTester(getTrainingDataFinder()); + TrainingDataFinder trainingDataFinder = getTrainingDataFinder(); + trainingDataFinder.init(args); + TrainingDataFinderTester tdft = new TrainingDataFinderTester(trainingDataFinder); executorTesterList.add(tdft); - LabellerTester lt = new LabellerTester(getLabeller()); + Labeller labeller = getLabeller(); + labeller.init(args); + LabellerTester lt = new LabellerTester(labeller); executorTesterList.add(lt); // training and labelling needed twice to get sufficient data - TrainingDataFinderTester tdft2 = new TrainingDataFinderTester(getTrainingDataFinder()); + TrainingDataFinder trainingDataFinder2 = getTrainingDataFinder(); + trainingDataFinder2.init(args); + TrainingDataFinderTester tdft2 = new TrainingDataFinderTester(trainingDataFinder2); executorTesterList.add(tdft2); - LabellerTester lt2 = new LabellerTester(getLabeller()); + Labeller labeller2 = getLabeller(); + labeller2.init(args); + LabellerTester lt2 = new LabellerTester(labeller2); executorTesterList.add(lt2); - TrainerTester tt = new TrainerTester(getTrainer()); + Trainer trainer = getTrainer(); + trainer.init(args); + TrainerTester tt = new TrainerTester(trainer); executorTesterList.add(tt); - MatcherTester mt = new MatcherTester(getMatcher()); + Matcher matcher = getMatcher(); + matcher.init(args); + MatcherTester mt = new MatcherTester(matcher); executorTesterList.add(mt); testExecutors(executorTesterList); diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java index dca91dae2..8128fdc1a 100644 --- a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java +++ b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -47,25 +47,21 @@ public String getConfigFile() { @Override protected SparkTrainingDataFinder getTrainingDataFinder() throws ZinggClientException { SparkTrainingDataFinder stdf = new SparkTrainingDataFinder(ctx); - stdf.init(args); return stdf; } @Override protected Labeller,Row,Column,DataType> getLabeller() throws ZinggClientException { JunitSparkLabeller jlbl = new JunitSparkLabeller(ctx); - jlbl.init(args); return jlbl; } @Override protected SparkTrainer getTrainer() throws ZinggClientException { SparkTrainer st = new SparkTrainer(ctx); - st.init(args); return st; } @Override protected SparkMatcher getMatcher() throws 
ZinggClientException { SparkMatcher sm = new SparkMatcher(ctx); - sm.init(args); return sm; } From 227764f5ce1c07f5d34a8dab0fc2914820c2485c Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 16:42:21 +0530 Subject: [PATCH 065/219] round not needed --- .../common/core/executor/MatcherTester.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java index 4d4b7cfc6..24500fe3f 100644 --- a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java +++ b/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -46,15 +46,21 @@ protected void assessAccuracy() throws ZinggClientException { long fnCount = fn.count(); long tpCount = tp.count(); long fpCount = fp.count(); - + double score1 = tpCount*1.0d/(tpCount+fpCount); + double score2 = tpCount*1.0d/(tpCount+fnCount); + LOG.info("False negative " + fnCount); LOG.info("True positive " + tpCount); LOG.info("False positive " + fpCount); - LOG.info("precision " + (tpCount*1.0d/(tpCount+fpCount))); - LOG.info("recall " + tpCount + " denom " + (tpCount+fnCount) + " overall " + (tpCount*1.0d/(tpCount+fnCount))); + LOG.info("precision " + score1); + LOG.info("recall " + tpCount + " denom " + (tpCount+fnCount) + " overall " + score2); - assertTrue(0.8 < Math.round(tpCount*1.0d/(tpCount+fpCount))); - assertTrue(0.8 < Math.round(tpCount*1.0d/(tpCount+fnCount))); + System.out.println("precision score1 " + score1); + + System.out.println("recall score2 " + score2); + + assertTrue(0.8 <= score1); + assertTrue(0.8 <= score2); } public ZFrame getOutputData() throws ZinggClientException { From 3a79946b77ceb99f899a22b2e17e8ff5c80b940f Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 17:05:46 +0530 Subject: [PATCH 066/219] moved to common test --- .../src/test/java/zingg/common/core/executor/ExecutorTester.java | 0 .../src/test/java/zingg/common/core/executor/JunitLabeller.java | 0 .../src/test/java/zingg/common/core/executor/LabellerTester.java | 0 .../src/test/java/zingg/common/core/executor/MatcherTester.java | 0 .../java/zingg/common/core/executor/TestExecutorsGeneric.java | 0 .../src/test/java/zingg/common/core/executor/TrainerTester.java | 0 .../java/zingg/common/core/executor/TrainingDataFinderTester.java | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/ExecutorTester.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/JunitLabeller.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/LabellerTester.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/MatcherTester.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/TrainerTester.java (100%) rename common/{common-test => core}/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java (100%) diff --git a/common/common-test/src/test/java/zingg/common/core/executor/ExecutorTester.java b/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/ExecutorTester.java rename to 
common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/JunitLabeller.java b/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/JunitLabeller.java rename to common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/LabellerTester.java b/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/LabellerTester.java rename to common/core/src/test/java/zingg/common/core/executor/LabellerTester.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/MatcherTester.java rename to common/core/src/test/java/zingg/common/core/executor/MatcherTester.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java rename to common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/TrainerTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/TrainerTester.java rename to common/core/src/test/java/zingg/common/core/executor/TrainerTester.java diff --git a/common/common-test/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java similarity index 100% rename from common/common-test/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java rename to common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java From 2184a845503260f3618339c31b90e729ccf13ae6 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 17:11:44 +0530 Subject: [PATCH 067/219] move tests to common tests --- common/core/pom.xml | 34 ++++++++++++++++++++++++++++++++++ spark/spark-test/pom.xml | 8 ++++++++ 2 files changed, 42 insertions(+) diff --git a/common/core/pom.xml b/common/core/pom.xml index 1f03d694c..40d61e0c4 100644 --- a/common/core/pom.xml +++ b/common/core/pom.xml @@ -29,5 +29,39 @@ httpclient 4.5.14 + + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 + test + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + + + diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index f612edfc7..2d6289781 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -29,6 +29,14 @@ zingg-common-client ${zingg.version} + + zingg + zingg-common-core + tests + test-jar + ${zingg.version} + test + zingg zingg-common-test From 6b97885ab4e71f085891eebcb3f9a2d9141c510e Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 18 Feb 2024 17:21:51 +0530 Subject: [PATCH 068/219] 
removed module common-test --- common/common-test/pom.xml | 57 -------------------------------------- common/pom.xml | 1 - spark/spark-test/pom.xml | 8 ------ 3 files changed, 66 deletions(-) delete mode 100644 common/common-test/pom.xml diff --git a/common/common-test/pom.xml b/common/common-test/pom.xml deleted file mode 100644 index 1c1ee7a0d..000000000 --- a/common/common-test/pom.xml +++ /dev/null @@ -1,57 +0,0 @@ - - 4.0.0 - - zingg - zingg-common - ${zingg.version} - - zingg-common-test - jar - - - zingg - zingg-common-core - ${zingg.version} - - - zingg - zingg-common-client - ${zingg.version} - - - org.junit.jupiter - junit-jupiter-engine - 5.8.1 - test - - - org.junit.jupiter - junit-jupiter-api - 5.8.1 - test - - - org.junit.jupiter - junit-jupiter-params - 5.8.1 - test - - - - - - org.apache.maven.plugins - maven-jar-plugin - 2.3.2 - - - - test-jar - - - - - - - diff --git a/common/pom.xml b/common/pom.xml index a1cade2bb..c50c2b037 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -12,7 +12,6 @@ infra core client - common-test py diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml index 2d6289781..54c2255b9 100644 --- a/spark/spark-test/pom.xml +++ b/spark/spark-test/pom.xml @@ -37,14 +37,6 @@ ${zingg.version} test - - zingg - zingg-common-test - tests - test-jar - ${zingg.version} - test - org.junit.jupiter junit-jupiter-engine From 3885cdf68e4345e326607a5aafc8cea1749608c9 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 19 Feb 2024 16:42:01 +0530 Subject: [PATCH 069/219] throws ZinggClientException added --- .../zingg/common/client/event/listeners/EventsListener.java | 5 +++-- .../zingg/common/client/event/listeners/IEventListener.java | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java index 8ef14f44f..3483b7f8d 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -1,5 +1,6 @@ package zingg.common.client.event.listeners; +import zingg.common.client.ZinggClientException; import zingg.common.client.event.events.IEvent; import zingg.common.client.util.ListMap; @@ -21,11 +22,11 @@ public void addListener(Class eventClass, IEventListener liste eventListeners.add(eventClass, listener); } - public void fireEvent(IEvent event) { + public void fireEvent(IEvent event) throws ZinggClientException { listen(event); } - private void listen(IEvent event) { + private void listen(IEvent event) throws ZinggClientException { Class eventClass = event.getClass(); for (IEventListener listener : eventListeners.get(eventClass)) { listener.listen(event); diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java index 9764ff031..756eea766 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java @@ -1,10 +1,11 @@ package zingg.common.client.event.listeners; +import zingg.common.client.ZinggClientException; import zingg.common.client.event.events.IEvent; public class IEventListener { - public void listen(IEvent event){ + public void listen(IEvent event) throws ZinggClientException { } } From 
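Patch 069 above widens the listener contract: listen may now throw ZinggClientException, so a failure inside a listener propagates to whoever called fireEvent instead of being silently swallowed. A hedged sketch of a listener written against the new signature (AuditListener is hypothetical, and a String-accepting ZinggClientException constructor is assumed):

import zingg.common.client.ZinggClientException;
import zingg.common.client.event.events.IEvent;
import zingg.common.client.event.listeners.IEventListener;

// Hypothetical subclass, not part of the patch: it exists only to show
// that a listener can now signal failure to the event dispatcher.
public class AuditListener extends IEventListener {
    @Override
    public void listen(IEvent event) throws ZinggClientException {
        if (event == null) {
            // assumes ZinggClientException offers a String constructor
            throw new ZinggClientException("null event delivered to listener");
        }
    }
}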
6fd7c45f823b7a47cac8db88118ca700c0768f21 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 19 Feb 2024 17:33:25 +0530 Subject: [PATCH 070/219] moved to spark core --- .../test/java/zingg/common}/core/executor/JunitSparkLabeller.java | 0 .../test/java/zingg/common}/core/executor/TestSparkExecutors.java | 0 .../resources/zingg/spark/core/executor/configSparkIntTest.json | 0 .../src/test/resources/zingg/spark/core/executor/test.csv | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename spark/{spark-test/src/test/java/zingg/spark => core/src/test/java/zingg/common}/core/executor/JunitSparkLabeller.java (100%) rename spark/{spark-test/src/test/java/zingg/spark => core/src/test/java/zingg/common}/core/executor/TestSparkExecutors.java (100%) rename spark/{spark-test => core}/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json (100%) rename spark/{spark-test => core}/src/test/resources/zingg/spark/core/executor/test.csv (100%) diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java similarity index 100% rename from spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java rename to spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java similarity index 100% rename from spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java rename to spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java diff --git a/spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json b/spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json similarity index 100% rename from spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json rename to spark/core/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json diff --git a/spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv b/spark/core/src/test/resources/zingg/spark/core/executor/test.csv similarity index 100% rename from spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv rename to spark/core/src/test/resources/zingg/spark/core/executor/test.csv From 7dbe1b31e84763d9fd60d7a264105fe1f08a563c Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 19 Feb 2024 17:35:45 +0530 Subject: [PATCH 071/219] moved to spark core and removed spark-test module --- spark/core/pom.xml | 38 ++++++++++ .../core/executor/JunitSparkLabeller.java | 3 +- .../core/executor/TestSparkExecutors.java | 5 +- spark/pom.xml | 1 - spark/spark-test/pom.xml | 75 ------------------- 5 files changed, 44 insertions(+), 78 deletions(-) delete mode 100644 spark/spark-test/pom.xml diff --git a/spark/core/pom.xml b/spark/core/pom.xml index 3674129b2..bbc568fb0 100644 --- a/spark/core/pom.xml +++ b/spark/core/pom.xml @@ -24,6 +24,32 @@ zingg-common-client ${zingg.version} + + zingg + zingg-common-core + tests + test-jar + ${zingg.version} + test + + + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 + test + @@ -54,6 +80,18 @@ ${scala.version} + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + diff --git 
a/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java b/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java index ba1ed9372..6b010f222 100644 --- a/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java +++ b/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java @@ -1,4 +1,4 @@ -package zingg.spark.core.executor; +package zingg.common.core.executor; import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; @@ -12,6 +12,7 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.JunitLabeller; import zingg.spark.core.context.ZinggSparkContext; +import zingg.spark.core.executor.SparkLabeller; public class JunitSparkLabeller extends SparkLabeller { diff --git a/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java b/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java index 8128fdc1a..e853f76e1 100644 --- a/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java +++ b/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java @@ -1,4 +1,4 @@ -package zingg.spark.core.executor; +package zingg.common.core.executor; import java.io.File; import java.io.IOException; @@ -16,6 +16,9 @@ import zingg.common.core.executor.Labeller; import zingg.common.core.executor.TestExecutorsGeneric; import zingg.spark.core.context.ZinggSparkContext; +import zingg.spark.core.executor.SparkMatcher; +import zingg.spark.core.executor.SparkTrainer; +import zingg.spark.core.executor.SparkTrainingDataFinder; public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { protected static final String CONFIG_FILE = "zingg/spark/core/executor/configSparkIntTest.json"; diff --git a/spark/pom.xml b/spark/pom.xml index 3f7444b54..2ea784073 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -11,7 +11,6 @@ core client - spark-test diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml deleted file mode 100644 index 54c2255b9..000000000 --- a/spark/spark-test/pom.xml +++ /dev/null @@ -1,75 +0,0 @@ - - 4.0.0 - - zingg - zingg-spark - ${zingg.version} - - zingg-spark-test - jar - - - zingg - zingg-spark-core - ${zingg.version} - - - zingg - zingg-spark-client - ${zingg.version} - - - zingg - zingg-common-core - ${zingg.version} - - - zingg - zingg-common-client - ${zingg.version} - - - zingg - zingg-common-core - tests - test-jar - ${zingg.version} - test - - - org.junit.jupiter - junit-jupiter-engine - 5.8.1 - test - - - org.junit.jupiter - junit-jupiter-api - 5.8.1 - test - - - org.junit.jupiter - junit-jupiter-params - 5.8.1 - test - - - - - - org.apache.maven.plugins - maven-jar-plugin - 2.3.2 - - - - test-jar - - - - - - - From 21d39be91bcac2ae2fe38feadca51a418245fdc5 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 19 Feb 2024 17:49:31 +0530 Subject: [PATCH 072/219] moved to spark package --- .../zingg/{common => spark}/core/executor/JunitSparkLabeller.java | 0 .../zingg/{common => spark}/core/executor/TestSparkExecutors.java | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename spark/core/src/test/java/zingg/{common => spark}/core/executor/JunitSparkLabeller.java (100%) rename spark/core/src/test/java/zingg/{common => spark}/core/executor/TestSparkExecutors.java (100%) diff --git a/spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java b/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java similarity index 100% rename from 
spark/core/src/test/java/zingg/common/core/executor/JunitSparkLabeller.java rename to spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java diff --git a/spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java similarity index 100% rename from spark/core/src/test/java/zingg/common/core/executor/TestSparkExecutors.java rename to spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java From 4952e66897cd3d3cbfeba9e99f76b2bafcbb6096 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 19 Feb 2024 17:50:04 +0530 Subject: [PATCH 073/219] moved to spark package --- .../java/zingg/spark/core/executor/JunitSparkLabeller.java | 3 +-- .../java/zingg/spark/core/executor/TestSparkExecutors.java | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java index 6b010f222..ba1ed9372 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java @@ -1,4 +1,4 @@ -package zingg.common.core.executor; +package zingg.spark.core.executor; import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; @@ -12,7 +12,6 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.core.executor.JunitLabeller; import zingg.spark.core.context.ZinggSparkContext; -import zingg.spark.core.executor.SparkLabeller; public class JunitSparkLabeller extends SparkLabeller { diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java index e853f76e1..8128fdc1a 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -1,4 +1,4 @@ -package zingg.common.core.executor; +package zingg.spark.core.executor; import java.io.File; import java.io.IOException; @@ -16,9 +16,6 @@ import zingg.common.core.executor.Labeller; import zingg.common.core.executor.TestExecutorsGeneric; import zingg.spark.core.context.ZinggSparkContext; -import zingg.spark.core.executor.SparkMatcher; -import zingg.spark.core.executor.SparkTrainer; -import zingg.spark.core.executor.SparkTrainingDataFinder; public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { protected static final String CONFIG_FILE = "zingg/spark/core/executor/configSparkIntTest.json"; From 081aab9d4f5d253fb3faebde8d6f679e1bcf57ba Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Tue, 20 Feb 2024 14:32:46 +0530 Subject: [PATCH 074/219] Python code generation using annotations --- .../java/zingg/common/client/Arguments.java | 2 +- .../zingg/common/client/FieldDefinition.java | 2 +- .../java/zingg/common/client/pipe/Pipe.java | 2 +- .../common/py/annotations/PythonClass.java | 6 +- .../common/py/annotations/PythonMethod.java | 2 - .../py/processors/PythonClassProcessor.java | 108 ++++++++++-------- .../py/processors/PythonMethodProcessor.java | 52 +-------- python/zingg/FieldDefinitionGenerated.py | 37 ------ .../client.py} | 37 ++++++ .../pipes.py} | 0 10 files changed, 108 insertions(+), 140 deletions(-) delete mode 100644 python/zingg/FieldDefinitionGenerated.py rename python/{zingg/ArgumentsGenerated.py => zinggGenerated/client.py} 
(80%) rename python/{zingg/PipeGenerated.py => zinggGenerated/pipes.py} (100%) diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index 5116a5cd9..5d48aad9d 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -81,7 +81,7 @@ * } * */ -@PythonClass +@PythonClass(module = "client") @JsonInclude(Include.NON_NULL) public class Arguments implements Serializable, IArguments { diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index f850fce8a..93f58c837 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -33,7 +33,7 @@ * @author sgoyal * */ -@PythonClass +@PythonClass(module = "client") public class FieldDefinition implements Serializable { diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index aab0878b1..81df7c8ff 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -20,7 +20,7 @@ * @author sgoyal * */ -@PythonClass +@PythonClass(module = "pipes") @JsonInclude(Include.NON_NULL) public class Pipe implements Serializable{ // St:StructType, Sv:SaveMode diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java index 0d3bf21a5..6ef4f208e 100644 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java @@ -1,9 +1,9 @@ package zingg.common.py.annotations; -import javax.annotation.processing.*; - import java.lang.annotation.Target; import java.lang.annotation.ElementType; @Target({ElementType.TYPE}) -public @interface PythonClass {} \ No newline at end of file +public @interface PythonClass { + String module(); +} \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java index f59a9c038..a37807d90 100644 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java @@ -1,7 +1,5 @@ package zingg.common.py.annotations; -import javax.annotation.processing.*; - import java.lang.annotation.Target; import java.lang.annotation.ElementType; diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 17bd7bdba..3193f3b6a 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -3,10 +3,8 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; +import java.util.HashSet; import java.util.List; -import java.util.Map; import javax.annotation.processing.*; import java.util.Set; @@ -20,55 +18,80 @@ @SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") public class PythonClassProcessor extends AbstractProcessor { - private Map> 
classMethodsMap = new HashMap<>(); + private Set processedElements = new HashSet<>(); @Override public synchronized void init(ProcessingEnvironment processingEnv) { + System.out.println("ProcessingEnv " + processingEnv); super.init(processingEnv); + + // Clear the output directory on initialization + String outputDirectory = "python/zinggGenerated"; + File dir = new File(outputDirectory); + if (!dir.exists()) { + dir.mkdirs(); + } else { + for (File file : dir.listFiles()) { + file.delete(); + } + } + } @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { - - // process Services annotation + // Process each PythonClass annotated element for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { - if (element.getKind() == ElementKind.CLASS) { - TypeElement classElement = (TypeElement) element; - PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); - String packageName = packageElement.getQualifiedName().toString(); - List methodNames = new ArrayList<>(); - - String outputDirectory = determineOutputDirectory(packageName); - - try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) { - generateImportsAndDeclarations(element, fileWriter); - - fileWriter.write("class " + element.getSimpleName() + ":\n"); - - // __init__ method - fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, element) + "):\n"); - generateClassInitializationCode(classElement, element, fileWriter); - - for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { - if (methodElement.getAnnotation(PythonMethod.class) != null) { - methodNames.add(methodElement.getSimpleName().toString()); - } - } - classMethodsMap.put(element.getSimpleName().toString(), methodNames); - } catch (IOException e) { - e.printStackTrace(); - } + if (element.getKind() == ElementKind.CLASS && !processedElements.contains(element)) { + processClass((TypeElement) element, roundEnv); } } - ProcessorContext processorContext = ProcessorContext.getInstance(); - processorContext.getClassMethodsMap().putAll(classMethodsMap); - return false; } - Map> getClassMethodsMap() { - return classMethodsMap; + + private void processClass(TypeElement classElement, RoundEnvironment roundEnv) { + + // Mark the class as processed + processedElements.add(classElement); + + // System.out.println("Called for " + classElement); + PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); + String packageName = packageElement.getQualifiedName().toString(); + PythonClass pythonClassAnnotation = classElement.getAnnotation(PythonClass.class); + + String outputDirectory = determineOutputDirectory(packageName); + String moduleName = pythonClassAnnotation.module(); + String outputFile = outputDirectory + File.separator + moduleName + ".py"; + + try (FileWriter fileWriter = new FileWriter(outputFile, true)) { + generateImportsAndDeclarations(classElement, fileWriter); + + fileWriter.write("class " + classElement.getSimpleName() + ":\n"); + + // __init__ method + fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, classElement) + "):\n"); + generateClassInitializationCode(classElement, classElement, fileWriter); + + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { + if (methodElement.getAnnotation(PythonMethod.class) != null) { + String javadoc = 
processingEnv.getElementUtils().getDocComment(methodElement); + if (javadoc != null) { + fileWriter.write(" '''\n"); + fileWriter.write(javadoc.trim()); + fileWriter.write("\n '''\n"); + } + + fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + PythonMethodProcessor.generateMethodSignature(methodElement) + "):\n"); + PythonMethodProcessor.generateMethodReturn(methodElement, fileWriter); + PythonMethodProcessor.generateFieldAssignment(methodElement, fileWriter); + fileWriter.write("\n"); + } + } + } catch (IOException e) { + e.printStackTrace(); + } } private String determineOutputDirectory(String packageName) { @@ -79,7 +102,7 @@ private String determineOutputDirectory(String packageName) { } else if (packageName.contains("enterprise") && packageName.contains("spark")) { return "spark/python"; } else { - return "python/zingg"; + return "python/zinggGenerated"; } } @@ -132,15 +155,6 @@ else if (element.getSimpleName().contentEquals("FieldDefinition")) { fileWriter.write("\n"); } - // private void generateFieldInitializationCode(VariableElement field, Element element) { - // String fieldName = field.getSimpleName().toString(); - // String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." + fieldName + " = " + fieldName; - - // if (!fieldName.startsWith("FORMAT_")) { - // System.out.println(" " + fieldAssignment); - // } - // } - private String generateConstructorParameters(TypeElement classElement, Element element) { StringBuilder parameters = new StringBuilder(); diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 1971adcb2..7781edb6d 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,10 +1,8 @@ package zingg.common.py.processors; -import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.List; -import java.util.Map; import javax.annotation.processing.*; import javax.lang.model.type.TypeMirror; @@ -12,59 +10,22 @@ import java.util.Set; import javax.lang.model.element.*; -import zingg.common.py.annotations.*; @SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") public class PythonMethodProcessor extends AbstractProcessor { - - private Map> classMethodsMap; @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { - - ProcessorContext processorContext = ProcessorContext.getInstance(); - classMethodsMap = processorContext.getClassMethodsMap(); - - for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { - - if (element.getKind() == ElementKind.METHOD) { - ExecutableElement methodElement = (ExecutableElement) element; - String className = methodElement.getEnclosingElement().getSimpleName().toString(); - - if (classMethodsMap.containsKey(className)) { - List methodNames = classMethodsMap.get(className); - - if (methodNames.contains(methodElement.getSimpleName().toString())) { - try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { - - String javadoc = processingEnv.getElementUtils().getDocComment(methodElement); - if (javadoc != null) { - fileWriter.write(" '''\n"); - fileWriter.write(javadoc.trim()); - fileWriter.write("\n '''\n"); - } - - fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + 
generateMethodSignature(methodElement) + "):\n"); - generateMethodReturn(methodElement, fileWriter); - generateFieldAssignment(methodElement, fileWriter); - fileWriter.write("\n"); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - } - } return false; } - private String generateMethodSignature(ExecutableElement methodElement) { + public static String generateMethodSignature(ExecutableElement methodElement) { StringBuilder signature = new StringBuilder(); signature.append(generateMethodParameters(methodElement)); return signature.toString(); } - private String generateMethodParameters(ExecutableElement methodElement) { + public static String generateMethodParameters(ExecutableElement methodElement) { StringBuilder parameters = new StringBuilder(); for (VariableElement parameter : methodElement.getParameters()) { parameters.append(", "); @@ -73,23 +34,18 @@ private String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - private void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + public static void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { TypeMirror returnType = methodElement.getReturnType(); if (returnType.getKind() == TypeKind.VOID) { return; } else { - String returnTypeString = resolveType(returnType); String methodName = methodElement.getSimpleName().toString(); String className = methodElement.getEnclosingElement().getSimpleName().toString(); fileWriter.write(" return self." + className.toLowerCase() + "." + methodName + "()\n"); } } - private String resolveType(TypeMirror typeMirror) { - return typeMirror.toString(); - } - - private void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + public static void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { List parameters = methodElement.getParameters(); if (!parameters.isEmpty()) { diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py deleted file mode 100644 index b08d75984..000000000 --- a/python/zingg/FieldDefinitionGenerated.py +++ /dev/null @@ -1,37 +0,0 @@ -from zingg.otherThanGenerated import * -''' -This class defines each field that we use in matching We can use this to - configure the properties of each field we use for matching in Zingg. 
- - @author sgoyal -''' -class FieldDefinition: - def __init__(self, name, dataType, *matchType): - self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() - self.fielddefinition.setFieldName(name) - self.fielddefinition.setDataType(self.stringify(dataType)) - self.fielddefinition.setMatchType(matchType) - self.fielddefinition.setFields(name) - - def getFieldDefinition(self): - return self.fielddefinition - - def setFields(self, fields): - self.fielddefinition.setFields(fields) - - ''' -Set the field type which defines the kind of matching we want to do - - @see MatchType - @param type - the type to set - ''' - def setMatchType(self, type): - self.fielddefinition.setMatchType(type) - - def setStopWords(self, stopWords): - self.fielddefinition.setStopWords(stopWords) - - def setFieldName(self, fieldName): - self.fielddefinition.setFieldName(fieldName) - diff --git a/python/zingg/ArgumentsGenerated.py b/python/zinggGenerated/client.py similarity index 80% rename from python/zingg/ArgumentsGenerated.py rename to python/zinggGenerated/client.py index bafb8d96d..5045a8ca2 100644 --- a/python/zingg/ArgumentsGenerated.py +++ b/python/zinggGenerated/client.py @@ -1,3 +1,40 @@ +from zingg.otherThanGenerated import * +''' +This class defines each field that we use in matching We can use this to + configure the properties of each field we use for matching in Zingg. + + @author sgoyal +''' +class FieldDefinition: + def __init__(self, name, dataType, *matchType): + self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() + self.fielddefinition.setFieldName(name) + self.fielddefinition.setDataType(self.stringify(dataType)) + self.fielddefinition.setMatchType(matchType) + self.fielddefinition.setFields(name) + + def getFieldDefinition(self): + return self.fielddefinition + + def setFields(self, fields): + self.fielddefinition.setFields(fields) + + ''' +Set the field type which defines the kind of matching we want to do + + @see MatchType + @param type + the type to set + ''' + def setMatchType(self, type): + self.fielddefinition.setMatchType(type) + + def setStopWords(self, stopWords): + self.fielddefinition.setStopWords(stopWords) + + def setFieldName(self, fieldName): + self.fielddefinition.setFieldName(fieldName) + from zingg.otherThanGenerated import * ''' This class helps supply match arguments to Zingg. There are 3 basic steps diff --git a/python/zingg/PipeGenerated.py b/python/zinggGenerated/pipes.py similarity index 100% rename from python/zingg/PipeGenerated.py rename to python/zinggGenerated/pipes.py From 228b19c5204f00946b600c10f283801f2536fedc Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 20 Feb 2024 14:45:15 +0530 Subject: [PATCH 075/219] initialize listeners and fire event --- common/client/src/main/java/zingg/common/client/Client.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/client/src/main/java/zingg/common/client/Client.java index bd7e9abfe..e9ba29f26 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -185,10 +185,12 @@ public void printAnalyticsBanner(boolean collectMetrics) { public abstract Client getClient(IArguments args, ClientOptions options) throws ZinggClientException; public void mainMethod(String... 
args) { + initializeListeners(); printBanner(); Client client = null; ClientOptions options = null; try { + EventsListener.getInstance().fireEvent(new ZinggStartEvent()); for (String a: args) LOG.debug("args " + a); options = new ClientOptions(args); setOptions(options); From cadaa5fcfe09ee3c8aa68420bfa09cbde787cbea Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 21 Feb 2024 10:12:05 +0530 Subject: [PATCH 076/219] Annotations tested in GeneratedFebrlExample.py --- examples/febrl/GeneratedFebrlExample.py | 7 +++---- python/MANIFEST.in | 1 + python/setup.py | 10 +++++++++- python/zingg/otherThanGeneratedArguments.py | 2 +- python/zingg/otherThanGeneratedFieldDefinition.py | 2 +- python/zingg/otherThanGeneratedPipe.py | 2 +- python/{zingg => zinggOld}/client.py | 0 python/{zingg => zinggOld}/pipes.py | 0 8 files changed, 16 insertions(+), 8 deletions(-) rename python/{zingg => zinggOld}/client.py (100%) rename python/{zingg => zinggOld}/pipes.py (100%) diff --git a/examples/febrl/GeneratedFebrlExample.py b/examples/febrl/GeneratedFebrlExample.py index 54c64e77e..e667c2429 100644 --- a/examples/febrl/GeneratedFebrlExample.py +++ b/examples/febrl/GeneratedFebrlExample.py @@ -1,6 +1,5 @@ -from zingg.ArgumentsGenerated import * -from zingg.FieldDefinitionGenerated import * -from zingg.PipeGenerated import * +from zingg.zinggGenerated.client import * +from zingg.zinggGenerated.pipes import * from zingg.otherThanGenerated import * from zingg.otherThanGeneratedPipe import * from zingg.otherThanGeneratedArguments import * @@ -39,7 +38,7 @@ args.setOutput(outputPipe) -options = ClientOptions([ClientOptions.PHASE,"findTrainingData"]) +options = ClientOptions([ClientOptions.PHASE,"match"]) #Zingg execution for the given phase zingg = Zingg(args, options) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index b9582aea0..4caa178c5 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -11,4 +11,5 @@ recursive-include zingg/examples/amazon-google * recursive-include zingg/examples/febrl * recursive-include zingg/models * recursive-include zingg/phases *.py +recursive-include zingg/zinggGenerated *.py recursive-include zingg/config * diff --git a/python/setup.py b/python/setup.py index 514c2e180..3be6a72ff 100644 --- a/python/setup.py +++ b/python/setup.py @@ -56,6 +56,7 @@ DATA_PATH = os.path.join(ZINGG_HOME, "models") CONF_PATH = os.path.join(ZINGG_HOME, "config") PHASES_PATH = os.path.join(ZINGG_HOME, "python/phases") +GENERATEDCODE_PATH = os.path.join(ZINGG_HOME, "python/zinggGenerated") SCRIPTS_TARGET = os.path.join("zingg", "scripts") JARS_TARGET = os.path.join("zingg", "jars") @@ -63,6 +64,7 @@ DATA_TARGET = os.path.join("zingg", "models") CONF_TARGET = os.path.join("zingg", "config") PHASES_TARGET = os.path.join("zingg", "phases") +GENERATEDCODE_TARGET = os.path.join("zingg", "zinggGenerated") # Check and see if we are under the Zingg path in which case we need to build the symlink farm. 
# This is important because we only want to build the symlink farm while under Zingg otherwise we @@ -112,6 +114,7 @@ def run(self): os.symlink(DATA_PATH, DATA_TARGET) os.symlink(CONF_PATH, CONF_TARGET) os.symlink(PHASES_PATH, PHASES_TARGET) + os.symlink(GENERATEDCODE_PATH, GENERATEDCODE_TARGET) else: # For windows fall back to the slower copytree copytree(JARS_PATH, JARS_TARGET) @@ -120,6 +123,7 @@ def run(self): copytree(DATA_PATH, DATA_TARGET) copytree(CONF_PATH, CONF_TARGET) copytree(PHASES_PATH, PHASES_TARGET) + copytree(GENERATEDCODE_PATH, GENERATEDCODE_TARGET) else: # If we are not inside of ZINGG_HOME verify we have the required symlink farm if not os.path.exists(JARS_TARGET): @@ -158,7 +162,8 @@ def run(self): 'zingg.data': 'zingg/models', 'zingg.examples': 'zingg/examples', 'zingg.conf': 'zingg/config', - 'zingg.phases': 'zingg/phases' + 'zingg.phases': 'zingg/phases', + 'zingg.zinggGenerated': 'zingg/zinggGenerated' }, package_data={ 'zingg.jars': ['*.jar'], @@ -167,6 +172,7 @@ def run(self): 'zingg.examples': ['*.py', '*/examples/*.py'], 'zingg.conf': ['*'], 'zingg.phases': ['*'], + 'zingg.zinggGenerated': ['*'], '':['*.py'], '':['LICENSE'] }, @@ -198,6 +204,7 @@ def run(self): os.remove(os.path.join("zingg", "examples")) os.remove(os.path.join("zingg", "phases")) os.remove(os.path.join("zingg", "config")) + os.remove(os.path.join("zingg", "zinggGenerated")) else: rmtree(os.path.join("zingg", "jars")) rmtree(os.path.join("zingg", "scripts")) @@ -205,3 +212,4 @@ def run(self): rmtree(os.path.join("zingg", "examples")) rmtree(os.path.join("zingg", "phases")) rmtree(os.path.join("zingg", "config")) + rmtree(os.path.join("zingg", "zinggGenerated")) diff --git a/python/zingg/otherThanGeneratedArguments.py b/python/zingg/otherThanGeneratedArguments.py index 113d08ead..5abe1c7f5 100644 --- a/python/zingg/otherThanGeneratedArguments.py +++ b/python/zingg/otherThanGeneratedArguments.py @@ -1,4 +1,4 @@ -from zingg.ArgumentsGenerated import * +from zingg.zinggGenerated.client import * from zingg.otherThanGeneratedFieldDefinition import * class ExtendedArgumentsGenerated(Arguments): diff --git a/python/zingg/otherThanGeneratedFieldDefinition.py b/python/zingg/otherThanGeneratedFieldDefinition.py index 43f3d229e..195499d07 100644 --- a/python/zingg/otherThanGeneratedFieldDefinition.py +++ b/python/zingg/otherThanGeneratedFieldDefinition.py @@ -1,4 +1,4 @@ -from zingg.FieldDefinitionGenerated import * +from zingg.zinggGenerated.client import * class ExtendedFieldDefinitionGenerated(FieldDefinition): def __init__(self, name, dataType, *matchType): diff --git a/python/zingg/otherThanGeneratedPipe.py b/python/zingg/otherThanGeneratedPipe.py index a46df2794..e405b3386 100644 --- a/python/zingg/otherThanGeneratedPipe.py +++ b/python/zingg/otherThanGeneratedPipe.py @@ -1,4 +1,4 @@ -from zingg.PipeGenerated import * +from zingg.zinggGenerated.pipes import * class ExtendedPipeGenerated(Pipe): def __init__(self, name, format): diff --git a/python/zingg/client.py b/python/zinggOld/client.py similarity index 100% rename from python/zingg/client.py rename to python/zinggOld/client.py diff --git a/python/zingg/pipes.py b/python/zinggOld/pipes.py similarity index 100% rename from python/zingg/pipes.py rename to python/zinggOld/pipes.py From 739a4713094f0f5949dd28fb1d98a72816e2bff7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 21 Feb 2024 15:46:38 +0530 Subject: [PATCH 077/219] moved pipe util and related classes to client --- .../src/main/java/zingg/common/client}/util/DFReader.java | 0 
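The renames in this diffstat all point one way: the pipe reading and writing utilities leave zingg-common-core for zingg-common-client, so client-layer code can hold a PipeUtilBase without a compile-time dependency on core. A sketch of such a consumer (the class is hypothetical; the four type parameters mirror the session/dataset/row/column shape visible in the SparkClient changes of the next patch):

import zingg.common.client.util.PipeUtilBase;   // previously zingg.common.core.util.PipeUtilBase

// Hypothetical client-side holder, for illustration only: everything it
// imports now resolves from the client module alone.
public class ClientSidePipes<S, D, R, C> {
    private final PipeUtilBase<S, D, R, C> pipeUtil;

    public ClientSidePipes(PipeUtilBase<S, D, R, C> pipeUtil) {
        this.pipeUtil = pipeUtil;
    }

    public PipeUtilBase<S, D, R, C> getPipeUtil() {
        return pipeUtil;
    }
}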
.../src/main/java/zingg/common/client}/util/DFWriter.java | 0 .../src/main/java/zingg/common/client}/util/DSUtil.java | 0 .../src/main/java/zingg/common/client}/util/PipeUtil.java | 0 .../src/main/java/zingg/common/client}/util/PipeUtilBase.java | 0 .../src/main/java/zingg/spark/client}/util/SparkDFReader.java | 0 .../src/main/java/zingg/spark/client}/util/SparkDFWriter.java | 0 .../src/main/java/zingg/spark/client}/util/SparkDSUtil.java | 0 .../src/main/java/zingg/spark/client}/util/SparkPipeUtil.java | 0 spark/{core => client}/src/main/scala/reifier/scala/DFUtil.scala | 0 .../src/main/scala/reifier/scala/MyPolyExpansion.scala | 0 .../{core => client}/src/main/scala/reifier/scala/TypeTags.scala | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename common/{core/src/main/java/zingg/common/core => client/src/main/java/zingg/common/client}/util/DFReader.java (100%) rename common/{core/src/main/java/zingg/common/core => client/src/main/java/zingg/common/client}/util/DFWriter.java (100%) rename common/{core/src/main/java/zingg/common/core => client/src/main/java/zingg/common/client}/util/DSUtil.java (100%) rename common/{core/src/main/java/zingg/common/core => client/src/main/java/zingg/common/client}/util/PipeUtil.java (100%) rename common/{core/src/main/java/zingg/common/core => client/src/main/java/zingg/common/client}/util/PipeUtilBase.java (100%) rename spark/{core/src/main/java/zingg/spark/core => client/src/main/java/zingg/spark/client}/util/SparkDFReader.java (100%) rename spark/{core/src/main/java/zingg/spark/core => client/src/main/java/zingg/spark/client}/util/SparkDFWriter.java (100%) rename spark/{core/src/main/java/zingg/spark/core => client/src/main/java/zingg/spark/client}/util/SparkDSUtil.java (100%) rename spark/{core/src/main/java/zingg/spark/core => client/src/main/java/zingg/spark/client}/util/SparkPipeUtil.java (100%) rename spark/{core => client}/src/main/scala/reifier/scala/DFUtil.scala (100%) rename spark/{core => client}/src/main/scala/reifier/scala/MyPolyExpansion.scala (100%) rename spark/{core => client}/src/main/scala/reifier/scala/TypeTags.scala (100%) diff --git a/common/core/src/main/java/zingg/common/core/util/DFReader.java b/common/client/src/main/java/zingg/common/client/util/DFReader.java similarity index 100% rename from common/core/src/main/java/zingg/common/core/util/DFReader.java rename to common/client/src/main/java/zingg/common/client/util/DFReader.java diff --git a/common/core/src/main/java/zingg/common/core/util/DFWriter.java b/common/client/src/main/java/zingg/common/client/util/DFWriter.java similarity index 100% rename from common/core/src/main/java/zingg/common/core/util/DFWriter.java rename to common/client/src/main/java/zingg/common/client/util/DFWriter.java diff --git a/common/core/src/main/java/zingg/common/core/util/DSUtil.java b/common/client/src/main/java/zingg/common/client/util/DSUtil.java similarity index 100% rename from common/core/src/main/java/zingg/common/core/util/DSUtil.java rename to common/client/src/main/java/zingg/common/client/util/DSUtil.java diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtil.java b/common/client/src/main/java/zingg/common/client/util/PipeUtil.java similarity index 100% rename from common/core/src/main/java/zingg/common/core/util/PipeUtil.java rename to common/client/src/main/java/zingg/common/client/util/PipeUtil.java diff --git a/common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java b/common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java similarity 
index 100% rename from common/core/src/main/java/zingg/common/core/util/PipeUtilBase.java rename to common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java similarity index 100% rename from spark/core/src/main/java/zingg/spark/core/util/SparkDFReader.java rename to spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkDFWriter.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java similarity index 100% rename from spark/core/src/main/java/zingg/spark/core/util/SparkDFWriter.java rename to spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java similarity index 100% rename from spark/core/src/main/java/zingg/spark/core/util/SparkDSUtil.java rename to spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java similarity index 100% rename from spark/core/src/main/java/zingg/spark/core/util/SparkPipeUtil.java rename to spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java diff --git a/spark/core/src/main/scala/reifier/scala/DFUtil.scala b/spark/client/src/main/scala/reifier/scala/DFUtil.scala similarity index 100% rename from spark/core/src/main/scala/reifier/scala/DFUtil.scala rename to spark/client/src/main/scala/reifier/scala/DFUtil.scala diff --git a/spark/core/src/main/scala/reifier/scala/MyPolyExpansion.scala b/spark/client/src/main/scala/reifier/scala/MyPolyExpansion.scala similarity index 100% rename from spark/core/src/main/scala/reifier/scala/MyPolyExpansion.scala rename to spark/client/src/main/scala/reifier/scala/MyPolyExpansion.scala diff --git a/spark/core/src/main/scala/reifier/scala/TypeTags.scala b/spark/client/src/main/scala/reifier/scala/TypeTags.scala similarity index 100% rename from spark/core/src/main/scala/reifier/scala/TypeTags.scala rename to spark/client/src/main/scala/reifier/scala/TypeTags.scala From bea36e6f28260971d9d4623d17ceb59f29ab5599 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 21 Feb 2024 15:52:31 +0530 Subject: [PATCH 078/219] moved session, some of the utils to client --- .../main/java/zingg/common/client/Client.java | 17 ++++++++-- .../main/java/zingg/common/client/IZingg.java | 2 +- .../zingg/common/client/util/DFReader.java | 2 +- .../zingg/common/client/util/DFWriter.java | 2 +- .../java/zingg/common/client/util/DSUtil.java | 4 +-- .../zingg/common/client/util/PipeUtil.java | 3 +- .../common/client/util/PipeUtilBase.java | 2 +- .../zingg/common/core/context/Context.java | 6 ++-- .../common/core/executor/FindAndLabeller.java | 8 ++--- .../common/core/executor/TrainMatcher.java | 8 ++--- .../zingg/common/core/executor/ZinggBase.java | 8 ++--- .../common/core/preprocess/StopWords.java | 2 +- .../core/preprocess/StopWordsRemover.java | 2 +- .../common/core/util/BlockingTreeUtil.java | 1 + .../core/executor/TestExecutorsGeneric.java | 12 +++---- spark/client/pom.xml | 25 +++++++++++++++ .../java/zingg/spark/client/SparkClient.java | 32 ++++++++++++++++--- .../spark/client/util/SparkDFReader.java | 4 +-- .../spark/client/util/SparkDFWriter.java | 4 +-- 
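Patch 078, whose diffstat continues below, makes Client the owner of both the session and the pipe util: getSession() and getPipeUtil() build their object on first call and cache it, and IZingg.init now receives the session from outside rather than each executor constructing its own. Reduced to plain Java, the caching shape is this sketch (a generic stand-in, not a Zingg class):

// Lazy-holder pattern as applied to both the session and the pipe util:
// create at most once, on first request, and reuse afterwards.
public abstract class LazyHolder<T> {
    private T value;

    protected abstract T create();   // e.g. SparkSession.builder()...getOrCreate()

    public final T get() {
        if (value == null) {
            value = create();        // built only on the first call
        }
        return value;
    }

    public final void set(T prebuilt) {   // callers may inject a ready-made instance
        this.value = prebuilt;
    }
}

The SparkClient overrides further down follow exactly this check-then-build-then-cache sequence for SparkSession and SparkPipeUtil.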
.../zingg/spark/client/util/SparkDSUtil.java | 4 +-- .../spark/client/util/SparkPipeUtil.java | 8 ++--- spark/core/pom.xml | 25 --------------- .../spark/core/context/ZinggSparkContext.java | 27 ++++++++-------- .../spark/core/executor/SparkDocumenter.java | 6 ++-- .../core/executor/SparkFindAndLabeller.java | 6 ++-- .../core/executor/SparkLabelUpdater.java | 6 ++-- .../spark/core/executor/SparkLabeller.java | 6 ++-- .../spark/core/executor/SparkLinker.java | 6 ++-- .../spark/core/executor/SparkMatcher.java | 6 ++-- .../spark/core/executor/SparkPeekModel.java | 6 ++-- .../spark/core/executor/SparkRecommender.java | 6 ++-- .../core/executor/SparkTrainMatcher.java | 6 ++-- .../spark/core/executor/SparkTrainer.java | 6 ++-- .../executor/SparkTrainingDataFinder.java | 6 ++-- .../core/util/SparkBlockingTreeUtil.java | 2 +- .../src/test/java/zingg/TestFebrlDataset.java | 2 +- .../spark/core/executor/ZinggSparkTester.java | 6 ++-- 37 files changed, 159 insertions(+), 125 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index e9ba29f26..ed566f7c1 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -15,6 +15,7 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.Email; import zingg.common.client.util.EmailBody; +import zingg.common.client.util.PipeUtilBase; /** * This is the main point of interface with the Zingg matching product. @@ -29,7 +30,7 @@ public abstract class Client implements Serializable { protected IZingg zingg; protected ClientOptions options; protected S session; - + protected PipeUtilBase pipeUtil; public static final Log LOG = LogFactory.getLog(Client.class); protected String zFactoryClassName; @@ -258,7 +259,7 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { } public void init() throws ZinggClientException { - zingg.init(getArguments()); + zingg.init(getArguments(), getSession()); if (session != null) zingg.setSession(session); } @@ -342,4 +343,16 @@ public void initializeListeners() { addListener(new ZinggStopEvent(), new ZinggStopListener()); } + public abstract S getSession(); + + public void setSession(S s) { + this.session = s; + } + + public abstract PipeUtilBase getPipeUtil(); + + public void setPipeUtil(PipeUtilBase pipeUtil) { + this.pipeUtil = pipeUtil; + } + } \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/IZingg.java b/common/client/src/main/java/zingg/common/client/IZingg.java index 19448421d..61bd8133e 100644 --- a/common/client/src/main/java/zingg/common/client/IZingg.java +++ b/common/client/src/main/java/zingg/common/client/IZingg.java @@ -2,7 +2,7 @@ public interface IZingg { - public void init(IArguments args) + public void init(IArguments args, S session) throws ZinggClientException; public void execute() throws ZinggClientException; diff --git a/common/client/src/main/java/zingg/common/client/util/DFReader.java b/common/client/src/main/java/zingg/common/client/util/DFReader.java index 6bf84940b..89f867752 100644 --- a/common/client/src/main/java/zingg/common/client/util/DFReader.java +++ b/common/client/src/main/java/zingg/common/client/util/DFReader.java @@ -1,4 +1,4 @@ -package zingg.common.core.util; +package zingg.common.client.util; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; diff --git 
a/common/client/src/main/java/zingg/common/client/util/DFWriter.java b/common/client/src/main/java/zingg/common/client/util/DFWriter.java index c41e97196..9ddbfc88f 100644 --- a/common/client/src/main/java/zingg/common/client/util/DFWriter.java +++ b/common/client/src/main/java/zingg/common/client/util/DFWriter.java @@ -1,4 +1,4 @@ -package zingg.common.core.util; +package zingg.common.client.util; public interface DFWriter { diff --git a/common/client/src/main/java/zingg/common/client/util/DSUtil.java b/common/client/src/main/java/zingg/common/client/util/DSUtil.java index 15c4e2091..f8d2f8108 100644 --- a/common/client/src/main/java/zingg/common/client/util/DSUtil.java +++ b/common/client/src/main/java/zingg/common/client/util/DSUtil.java @@ -1,4 +1,4 @@ -package zingg.common.core.util; +package zingg.common.client.util; import zingg.common.client.FieldDefinition; @@ -7,8 +7,6 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.pipe.Pipe; -import zingg.common.client.util.ColName; -import zingg.common.client.util.ColValues; import java.util.ArrayList; import java.util.List; diff --git a/common/client/src/main/java/zingg/common/client/util/PipeUtil.java b/common/client/src/main/java/zingg/common/client/util/PipeUtil.java index 166afd7a5..415a4e36a 100644 --- a/common/client/src/main/java/zingg/common/client/util/PipeUtil.java +++ b/common/client/src/main/java/zingg/common/client/util/PipeUtil.java @@ -1,4 +1,4 @@ -package zingg.common.core.util; +package zingg.common.client.util; import java.util.Arrays; import java.util.stream.Collectors; @@ -12,7 +12,6 @@ import zingg.common.client.pipe.FilePipe; //import zingg.common.client.pipe.InMemoryPipe; import zingg.common.client.pipe.Pipe; -import zingg.common.client.util.ColName; //import com.datastax.spark.connector.cql.*; //import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL; diff --git a/common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java b/common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java index b6e87ba3a..b293d0b71 100644 --- a/common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java +++ b/common/client/src/main/java/zingg/common/client/util/PipeUtilBase.java @@ -1,4 +1,4 @@ -package zingg.common.core.util; +package zingg.common.client.util; import zingg.common.client.IArguments; import zingg.common.client.ZFrame; diff --git a/common/core/src/main/java/zingg/common/core/context/Context.java b/common/core/src/main/java/zingg/common/core/context/Context.java index 130cbbe5c..410e3ae3d 100644 --- a/common/core/src/main/java/zingg/common/core/context/Context.java +++ b/common/core/src/main/java/zingg/common/core/context/Context.java @@ -3,12 +3,12 @@ import java.io.Serializable; import zingg.common.client.ZinggClientException; +import zingg.common.client.util.DSUtil; +import zingg.common.client.util.PipeUtilBase; import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.DSUtil; import zingg.common.core.util.GraphUtil; import zingg.common.core.util.HashUtil; import zingg.common.core.util.ModelUtil; -import zingg.common.core.util.PipeUtilBase; public abstract class Context implements Serializable { protected S session; @@ -62,7 +62,7 @@ public BlockingTreeUtil getBlockingTreeUtil() { return this.blockingTreeUtil; } - public abstract void init() + public abstract void init(S session) throws ZinggClientException; public abstract void cleanup(); diff --git 
a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java index 8b791097c..b8eb3eff0 100644 --- a/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/FindAndLabeller.java @@ -20,10 +20,10 @@ public FindAndLabeller() { } @Override - public void init(IArguments args) throws ZinggClientException { - finder.init(args); - labeller.init(args); - super.init(args); + public void init(IArguments args, S s) throws ZinggClientException { + finder.init(args,s); + labeller.init(args,s); + super.init(args,s); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java index e185a9f2f..b4fdfc97e 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainMatcher.java @@ -21,11 +21,11 @@ public TrainMatcher() { } @Override - public void init(IArguments args) + public void init(IArguments args, S s) throws ZinggClientException { - trainer.init(args); - matcher.init(args); - super.init(args); + trainer.init(args,s); + matcher.init(args,s); + super.init(args,s); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java index 222c620a8..b9f07c32f 100644 --- a/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java +++ b/common/core/src/main/java/zingg/common/core/executor/ZinggBase.java @@ -17,15 +17,15 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; +import zingg.common.client.util.DSUtil; +import zingg.common.client.util.PipeUtilBase; import zingg.common.core.context.Context; import zingg.common.core.util.Analytics; import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.DSUtil; import zingg.common.core.util.GraphUtil; import zingg.common.core.util.HashUtil; import zingg.common.core.util.Metric; import zingg.common.core.util.ModelUtil; -import zingg.common.core.util.PipeUtilBase; public abstract class ZinggBase implements Serializable, IZingg { @@ -62,8 +62,8 @@ public ZinggBase() { } - - public void init(IArguments args) + @Override + public void init(IArguments args, S session) throws ZinggClientException { startTime = System.currentTimeMillis(); this.args = args; diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java index ea42b7401..8e1511489 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWords.java @@ -7,7 +7,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; -import zingg.common.core.util.PipeUtilBase; +import zingg.common.client.util.PipeUtilBase; public class StopWords { diff --git a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java index 7bdf88ac8..9742426c4 100644 --- a/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java +++ b/common/core/src/main/java/zingg/common/core/preprocess/StopWordsRemover.java @@ 
-13,8 +13,8 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; +import zingg.common.client.util.PipeUtilBase; import zingg.common.core.context.Context; -import zingg.common.core.util.PipeUtilBase; public abstract class StopWordsRemover implements Serializable{ diff --git a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java index 97e397230..bab1f04fa 100644 --- a/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java +++ b/common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java @@ -12,6 +12,7 @@ import zingg.common.client.ZinggClientException; import zingg.common.client.ZFrame; import zingg.common.client.util.ListMap; +import zingg.common.client.util.PipeUtilBase; import zingg.common.client.util.Util; import zingg.common.core.block.Block; import zingg.common.core.block.Canopy; diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java index 424eec10c..c9c3d53dd 100644 --- a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java +++ b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -50,33 +50,33 @@ public void testExecutors() throws ZinggClientException { List> executorTesterList = new ArrayList>(); TrainingDataFinder trainingDataFinder = getTrainingDataFinder(); - trainingDataFinder.init(args); + trainingDataFinder.init(args,session); TrainingDataFinderTester tdft = new TrainingDataFinderTester(trainingDataFinder); executorTesterList.add(tdft); Labeller labeller = getLabeller(); - labeller.init(args); + labeller.init(args,session); LabellerTester lt = new LabellerTester(labeller); executorTesterList.add(lt); // training and labelling needed twice to get sufficient data TrainingDataFinder trainingDataFinder2 = getTrainingDataFinder(); - trainingDataFinder2.init(args); + trainingDataFinder2.init(args,session); TrainingDataFinderTester tdft2 = new TrainingDataFinderTester(trainingDataFinder2); executorTesterList.add(tdft2); Labeller labeller2 = getLabeller(); - labeller2.init(args); + labeller2.init(args,session); LabellerTester lt2 = new LabellerTester(labeller2); executorTesterList.add(lt2); Trainer trainer = getTrainer(); - trainer.init(args); + trainer.init(args,session); TrainerTester tt = new TrainerTester(trainer); executorTesterList.add(tt); Matcher matcher = getMatcher(); - matcher.init(args); + matcher.init(args,session); MatcherTester mt = new MatcherTester(matcher); executorTesterList.add(mt); diff --git a/spark/client/pom.xml b/spark/client/pom.xml index 0515ed51b..10ee01930 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -43,6 +43,31 @@ + + net.alchim31.maven + scala-maven-plugin + 4.8.0 + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + org.apache.maven.plugins maven-javadoc-plugin diff --git a/spark/client/src/main/java/zingg/spark/client/SparkClient.java b/spark/client/src/main/java/zingg/spark/client/SparkClient.java index 7bc3213f3..ae61414c4 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkClient.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkClient.java @@ -1,18 +1,17 @@ package zingg.spark.client; -import java.io.Serializable; - import 
org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; import zingg.common.client.Client; import zingg.common.client.ClientOptions; import zingg.common.client.IArguments; -import zingg.common.client.IZinggFactory; import zingg.common.client.ZinggClientException; +import zingg.common.client.util.PipeUtilBase; +import zingg.spark.client.util.SparkPipeUtil; /** * This is the main point of interface with the Zingg matching product. * @@ -71,7 +70,30 @@ public static void main(String... args) { client.mainMethod(args); } + @Override + public SparkSession getSession() { + if (session!=null) { + return session; + } else { + SparkSession s = SparkSession + .builder() + .appName("Zingg") + .getOrCreate(); + setSession(s); + return s; + } + + } - + @Override + public PipeUtilBase, Row, Column> getPipeUtil() { + if (pipeUtil!=null) { + return pipeUtil; + } else { + PipeUtilBase, Row, Column> p = new SparkPipeUtil(session); + setPipeUtil(p); + return p; + } + } } \ No newline at end of file diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java index eeab194c4..cf67ff1f6 100644 --- a/spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFReader.java @@ -1,4 +1,4 @@ -package zingg.spark.core.util; +package zingg.spark.client.util; import org.apache.spark.sql.Column; import org.apache.spark.sql.DataFrameReader; @@ -8,7 +8,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.core.util.DFReader; +import zingg.common.client.util.DFReader; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java index 714c022f0..023a90fb6 100644 --- a/spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFWriter.java @@ -1,4 +1,4 @@ -package zingg.spark.core.util; +package zingg.spark.client.util; import org.apache.spark.sql.Column; import org.apache.spark.sql.DataFrameWriter; @@ -7,7 +7,7 @@ import org.apache.spark.sql.SaveMode; import zingg.common.client.ZFrame; -import zingg.common.core.util.DFWriter; +import zingg.common.client.util.DFWriter; public class SparkDFWriter implements DFWriter, Row, Column>{ private DataFrameWriter writer; diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java index 939b3b87d..ec7df7128 100644 --- a/spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDSUtil.java @@ -1,4 +1,4 @@ -package zingg.spark.core.util; +package zingg.spark.client.util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import zingg.common.client.ZFrame; -import zingg.common.core.util.DSUtil; +import zingg.common.client.util.DSUtil; import zingg.scala.DFUtil; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; diff --git 
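The getSession and getPipeUtil overrides above turn SparkClient into a lazy holder of both objects: an injected session is reused, otherwise one is built and cached on first call. A rough caller's-eye sketch of the new behaviour (the constructor arguments here are illustrative, not taken from this patch):

// Hypothetical caller; getSession() builds the "Zingg" SparkSession once and caches it.
SparkClient client = new SparkClient(args, options);
SparkSession spark = client.getSession();   // same instance on every later call
PipeUtilBase<SparkSession, Dataset<Row>, Row, Column> pipeUtil = client.getPipeUtil();   // lazily wraps the session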
a/spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java index 3c4dd617f..51530e7d3 100644 --- a/spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkPipeUtil.java @@ -1,4 +1,4 @@ -package zingg.spark.core.util; +package zingg.spark.client.util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -10,9 +10,9 @@ import zingg.common.client.ZFrame; //import zingg.common.client.pipe.InMemoryPipe; import zingg.common.client.pipe.Pipe; -import zingg.common.core.util.DFReader; -import zingg.common.core.util.DFWriter; -import zingg.common.core.util.PipeUtil; +import zingg.common.client.util.DFReader; +import zingg.common.client.util.DFWriter; +import zingg.common.client.util.PipeUtil; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; diff --git a/spark/core/pom.xml b/spark/core/pom.xml index bbc568fb0..82aa0f55d 100644 --- a/spark/core/pom.xml +++ b/spark/core/pom.xml @@ -55,31 +55,6 @@ - - net.alchim31.maven - scala-maven-plugin - 4.8.0 - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - org.apache.maven.plugins maven-jar-plugin diff --git a/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java b/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java index 2a39c6942..c9fcaac34 100644 --- a/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java +++ b/spark/core/src/main/java/zingg/spark/core/context/ZinggSparkContext.java @@ -11,20 +11,20 @@ import zingg.common.client.IZingg; import zingg.common.client.ZinggClientException; +import zingg.common.client.util.DSUtil; +import zingg.common.client.util.PipeUtilBase; // import zingg.common.core.context.Context; import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.DSUtil; import zingg.common.core.util.GraphUtil; import zingg.common.core.util.HashUtil; import zingg.common.core.util.ModelUtil; -import zingg.common.core.util.PipeUtilBase; +import zingg.spark.client.util.SparkDSUtil; +import zingg.spark.client.util.SparkPipeUtil; import zingg.spark.core.util.SparkBlockingTreeUtil; -import zingg.spark.core.util.SparkDSUtil; import zingg.spark.core.util.SparkGraphUtil; import zingg.spark.core.util.SparkHashUtil; import zingg.spark.core.util.SparkModelUtil; -import zingg.spark.core.util.SparkPipeUtil; public class ZinggSparkContext extends Context, Row,Column,DataType>{ @@ -37,17 +37,18 @@ public class ZinggSparkContext extends Context, Row,C @Override - public void init() + public void init(SparkSession session) throws ZinggClientException { try{ - if (session==null) { - session = SparkSession - .builder() - .appName("Zingg") - .getOrCreate(); - - //session = new SparkSession(spark, license); - } +// if (session==null) { +// session = SparkSession +// .builder() +// .appName("Zingg") +// .getOrCreate(); +// +// //session = new SparkSession(spark, license); +// } + this.session = session; if (ctx==null) { ctx = JavaSparkContext.fromSparkContext(session.sparkContext()); JavaSparkContext.jarOfClass(IZingg.class); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java index 5de555464..98e452c90 100644 --- 
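With ZinggSparkContext.init(SparkSession), the context no longer builds a session itself (the old builder call is left commented out above); the caller owns the session and injects it. A minimal sketch of the resulting flow, mirroring the test wiring later in this patch (args stands for an IArguments built elsewhere):

// The caller creates or obtains the session and hands it down.
SparkSession spark = SparkSession.builder().appName("Zingg").getOrCreate();
ZinggSparkContext ctx = new ZinggSparkContext();
ctx.init(spark);                        // wires JavaSparkContext and utils off the injected session
SparkDocumenter documenter = new SparkDocumenter(ctx);
documenter.init(args, spark);           // every executor now forwards the same session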
a/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkDocumenter.java @@ -36,9 +36,9 @@ public SparkDocumenter(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java index 01f377348..0c0aeb550 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkFindAndLabeller.java @@ -35,9 +35,9 @@ public SparkFindAndLabeller(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java index bf24ea9a4..33dcbd706 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabelUpdater.java @@ -39,9 +39,9 @@ public SparkLabelUpdater(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } public Pipe setSaveModeOnPipe(Pipe p) { diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java index 831bd5844..e8aa8f6ec 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLabeller.java @@ -37,9 +37,9 @@ public SparkLabeller(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java index 43e14423a..85f442314 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkLinker.java @@ -34,9 +34,9 @@ public SparkLinker(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java index 389a08df1..6cb0bc1cd 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java +++ 
b/spark/core/src/main/java/zingg/spark/core/executor/SparkMatcher.java @@ -40,9 +40,9 @@ public SparkMatcher(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); }
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java index c82bf15ed..115390b85 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkPeekModel.java @@ -34,13 +34,13 @@ public SparkPeekModel() { } @Override - public void init(IArguments args) + public void init(IArguments args, SparkSession s) throws ZinggClientException { - super.init(args); + super.init(args,s); getContext().setUtils(); //we will not init here as we want py to drive //the spark session etc - getContext().init(); + getContext().init(s); } @Override
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java index 0dc03bd9e..cf608a6e9 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkRecommender.java @@ -38,9 +38,9 @@ public SparkRecommender(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } @Override
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java index a2772a124..699af83bf 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainMatcher.java @@ -34,9 +34,9 @@ public SparkTrainMatcher(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } }
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java index 6d93abc2b..e23c5b043 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainer.java @@ -35,9 +35,9 @@ public SparkTrainer(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } @Override
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java index bffac326c..012effdab 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkTrainingDataFinder.java @@ -32,9 +32,9
@@ public SparkTrainingDataFinder(ZinggSparkContext sparkContext) { } @Override - public void init(IArguments args) throws ZinggClientException { - super.init(args); - getContext().init(); + public void init(IArguments args, SparkSession s) throws ZinggClientException { + super.init(args,s); + getContext().init(s); } @Override diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java index 674e1e877..984e07b83 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java @@ -20,12 +20,12 @@ import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; import zingg.common.client.util.ListMap; +import zingg.common.client.util.PipeUtilBase; import zingg.common.core.block.Block; import zingg.common.core.block.Canopy; import zingg.common.core.block.Tree; import zingg.common.core.hash.HashFunction; import zingg.common.core.util.BlockingTreeUtil; -import zingg.common.core.util.PipeUtilBase; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; import zingg.spark.core.block.SparkBlock; diff --git a/spark/core/src/test/java/zingg/TestFebrlDataset.java b/spark/core/src/test/java/zingg/TestFebrlDataset.java index 5b329c57a..a7ef49128 100644 --- a/spark/core/src/test/java/zingg/TestFebrlDataset.java +++ b/spark/core/src/test/java/zingg/TestFebrlDataset.java @@ -50,7 +50,7 @@ public void setUp() throws Exception, ZinggClientException{ public void testModelAccuracy(){ TrainMatcher tm = new SparkTrainMatcher(); try { - tm.init(args); + tm.init(args,spark); // tm.setSpark(spark); // tm.setCtx(ctx); tm.setArgs(args); diff --git a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java index f3a47d616..d15109f75 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java @@ -22,13 +22,13 @@ import zingg.common.client.IZingg; import org.apache.spark.sql.SparkSession; +import zingg.spark.client.util.SparkDSUtil; +import zingg.spark.client.util.SparkPipeUtil; import zingg.spark.core.context.ZinggSparkContext; import zingg.spark.core.util.SparkBlockingTreeUtil; -import zingg.spark.core.util.SparkDSUtil; import zingg.spark.core.util.SparkGraphUtil; import zingg.spark.core.util.SparkHashUtil; import zingg.spark.core.util.SparkModelUtil; -import zingg.spark.core.util.SparkPipeUtil; public class ZinggSparkTester { @@ -54,7 +54,7 @@ public static void setup() { JavaSparkContext.jarOfClass(IZingg.class); args = new Arguments(); zsCTX = new ZinggSparkContext(); - zsCTX.init(); + zsCTX.init(spark); } catch (Throwable e) { if (LOG.isDebugEnabled()) From 5940d99cb8f8176749a43db438ef02895dc4bcef Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Fri, 23 Feb 2024 11:03:44 +0530 Subject: [PATCH 079/219] Add parent and outputDirectory parameter in annotations --- .../java/zingg/common/client/Arguments.java | 2 +- .../zingg/common/client/FieldDefinition.java | 2 +- .../java/zingg/common/client/pipe/Pipe.java | 2 +- .../common/py/annotations/PythonClass.java | 2 + .../py/processors/PythonClassProcessor.java | 49 +++++++++---------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java 
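Commit 079 below replaces the processor's package-name heuristics with data carried on the annotation itself: each @PythonClass now states its output directory and, optionally, a Python parent class. A made-up example of an annotated class (not part of the patch) to show what the processor would consume:

// Hypothetical: the processor appends to python/zinggGenerated/client.py
// and, because parent is set, emits "class MyArguments(Arguments):".
@PythonClass(module = "client", parent = "Arguments", outputDirectory = "python/zinggGenerated")
public class MyArguments implements Serializable {
    // fields and annotated accessors as in the real Arguments class
}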
b/common/client/src/main/java/zingg/common/client/Arguments.java index 5d48aad9d..47474c310 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -81,7 +81,7 @@ * } * */ -@PythonClass(module = "client") +@PythonClass(module = "client", outputDirectory = "python/zinggGenerated") @JsonInclude(Include.NON_NULL) public class Arguments implements Serializable, IArguments {
diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 93f58c837..55c0c52f6 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -33,7 +33,7 @@ * @author sgoyal * */ -@PythonClass(module = "client") +@PythonClass(module = "client", outputDirectory = "python/zinggGenerated") public class FieldDefinition implements Serializable {
diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index 81df7c8ff..e726160d3 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -20,7 +20,7 @@ * @author sgoyal * */ -@PythonClass(module = "pipes") +@PythonClass(module = "pipes", outputDirectory = "python/zinggGenerated") @JsonInclude(Include.NON_NULL) public class Pipe implements Serializable{ // St:StructType, Sv:SaveMode
diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java index 6ef4f208e..e557f9a3c 100644 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java @@ -6,4 +6,6 @@ @Target({ElementType.TYPE}) public @interface PythonClass { String module(); + String parent() default ""; + String outputDirectory(); } \ No newline at end of file
diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 3193f3b6a..3f41ac899 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -19,6 +19,7 @@ public class PythonClassProcessor extends AbstractProcessor { private Set processedElements = new HashSet<>(); + private Set folders = new HashSet<>(); @Override public synchronized void init(ProcessingEnvironment processingEnv) { @@ -26,16 +27,21 @@ public synchronized void init(ProcessingEnvironment processingEnv) { super.init(processingEnv); // Clear the output directory on initialization - String outputDirectory = "python/zinggGenerated"; - File dir = new File(outputDirectory); - if (!dir.exists()) { - dir.mkdirs(); - } else { - for (File file : dir.listFiles()) { - file.delete(); + folders.add("python/zinggGenerated"); + folders.add("common/python"); + folders.add("snowflake/python"); + folders.add("spark/python"); + + for (String folder : folders) { + File directory = new File(folder); + if (directory.exists()) { + for (File file : directory.listFiles()) { + file.delete(); + System.out.println("deleted stale generated file " + file); + } } } - } @Override @@ -56,19 +62,22 @@ private void processClass(TypeElement classElement, RoundEnvironment roundEnv) { // Mark the class as processed processedElements.add(classElement); - // System.out.println("Called for " + classElement); - PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); - String packageName = packageElement.getQualifiedName().toString(); PythonClass pythonClassAnnotation = classElement.getAnnotation(PythonClass.class); - String outputDirectory = determineOutputDirectory(packageName); + String outputDirectory = pythonClassAnnotation.outputDirectory(); String moduleName = pythonClassAnnotation.module(); String outputFile = outputDirectory + File.separator + moduleName + ".py"; + String parentClassName = pythonClassAnnotation.parent(); try (FileWriter fileWriter = new FileWriter(outputFile, true)) { generateImportsAndDeclarations(classElement, fileWriter); + if (!parentClassName.isEmpty()) { + fileWriter.write("class " + classElement.getSimpleName() + "(" + parentClassName + "):\n"); + } else { + fileWriter.write("class " + classElement.getSimpleName() + ":\n"); + } // __init__ method fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, classElement) + "):\n"); @@ -92,18 +101,6 @@ private void processClass(TypeElement classElement, RoundEnvironment roundEnv) { } catch (IOException e) { e.printStackTrace(); } - } - - private String determineOutputDirectory(String packageName) { - if (packageName.contains("enterprise") && packageName.contains("common")) { - return "common/python"; - } else if (packageName.contains("enterprise") && packageName.contains("snowflake")) { - return "snowflake/python"; - } else if (packageName.contains("enterprise") && packageName.contains("spark")) { - return "spark/python"; - } else { - return "python/zinggGenerated"; - } } private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException {
From dcdb633ad3a24fbd990833c69e1f874647468ac8 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 23 Feb 2024 14:28:44 +0530 Subject: [PATCH 080/219] firing stop event --- common/client/src/main/java/zingg/common/client/Client.java | 1 + 1 file changed, 1 insertion(+)
diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index ed566f7c1..7281bdb4f 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -243,6 +243,7 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { } finally { try { + EventsListener.getInstance().fireEvent(new ZinggStopEvent()); if (client != null) { //client.postMetrics(); client.stop();
From a8cd12aacc2c9186df463d37512bf61c78ab43fa Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 23 Feb 2024 18:18:26 +0530 Subject: [PATCH 081/219] set options to get phase before initialising listeners --- .../client/src/main/java/zingg/common/client/Client.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 7281bdb4f..469d11bef 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -186,16 +186,19 @@ public void printAnalyticsBanner(boolean
collectMetrics) { public abstract Client getClient(IArguments args, ClientOptions options) throws ZinggClientException; public void mainMethod(String... args) { - initializeListeners(); printBanner(); Client client = null; ClientOptions options = null; try { - EventsListener.getInstance().fireEvent(new ZinggStartEvent()); + for (String a: args) LOG.debug("args " + a); options = new ClientOptions(args); setOptions(options); - + + // after setting options as some of the listeners need options + initializeListeners(); + EventsListener.getInstance().fireEvent(new ZinggStartEvent()); + if (options.has(options.HELP) || options.has(options.HELP1) || options.get(ClientOptions.PHASE) == null) { LOG.warn(options.getHelp()); System.exit(0); From 7d0dd6be835bb686ba5e2ebb8ccf61112edda7a2 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Sun, 25 Feb 2024 17:53:05 +0530 Subject: [PATCH 082/219] args needed in start listener to access zingg dir --- .../client/src/main/java/zingg/common/client/Client.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 469d11bef..c55efa37f 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -195,10 +195,6 @@ public void mainMethod(String... args) { options = new ClientOptions(args); setOptions(options); - // after setting options as some of the listeners need options - initializeListeners(); - EventsListener.getInstance().fireEvent(new ZinggStartEvent()); - if (options.has(options.HELP) || options.has(options.HELP1) || options.get(ClientOptions.PHASE) == null) { LOG.warn(options.getHelp()); System.exit(0); @@ -216,6 +212,10 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { arguments = getArgsUtil().createArgumentsFromJSONString(options.get(ClientOptions.CONF).value, phase); } + // after setting arguments as some of the listeners need arguments + initializeListeners(); + EventsListener.getInstance().fireEvent(new ZinggStartEvent()); + client = getClient(arguments, options); client.init(); client.execute(); From 8d7e651fc2b3613e1f11febb49b731cc18e2bf0a Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 26 Feb 2024 14:19:40 +0530 Subject: [PATCH 083/219] args to be accessible to listener --- common/client/src/main/java/zingg/common/client/Client.java | 1 - 1 file changed, 1 deletion(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index c55efa37f..ecf8947e5 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -201,7 +201,6 @@ public void mainMethod(String... 
args) { } String phase = options.get(ClientOptions.PHASE).value.trim(); ZinggOptions.verifyPhase(phase); - IArguments arguments = null; if (options.get(ClientOptions.CONF).value.endsWith("json")) { arguments = getArgsUtil().createArgumentsFromJSON(options.get(ClientOptions.CONF).value, phase); }
From e1d2948ab6d002e6e294904db2497d471db1493b Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 26 Feb 2024 15:10:41 +0530 Subject: [PATCH 084/219] listeners after init --- .../client/src/main/java/zingg/common/client/Client.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index ecf8947e5..d087705d5 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -211,12 +211,11 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { arguments = getArgsUtil().createArgumentsFromJSONString(options.get(ClientOptions.CONF).value, phase); } - // after setting arguments as some of the listeners need arguments - initializeListeners(); - EventsListener.getInstance().fireEvent(new ZinggStartEvent()); - client = getClient(arguments, options); client.init(); + // after setting arguments etc. as some of the listeners need it + initializeListeners(); + EventsListener.getInstance().fireEvent(new ZinggStartEvent()); client.execute(); client.postMetrics(); LOG.warn("Zingg processing has completed");
From bf3a697d78a723504fb612f0ab8f72bb382b0294 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 27 Feb 2024 10:39:58 +0530 Subject: [PATCH 085/219] fixing props for event data --- .../common/client/event/events/IEvent.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/common/client/src/main/java/zingg/common/client/event/events/IEvent.java b/common/client/src/main/java/zingg/common/client/event/events/IEvent.java index 53f443972..6fe90d0f2 100644 --- a/common/client/src/main/java/zingg/common/client/event/events/IEvent.java +++ b/common/client/src/main/java/zingg/common/client/event/events/IEvent.java @@ -4,11 +4,22 @@ public class IEvent { - public HashMap getProps(){ - return null; + protected HashMap eventDataProps; + + public IEvent() { + super(); + } + + public IEvent(HashMap eventDataProps) { + super(); + this.eventDataProps = eventDataProps; + } + + public HashMap getProps(){ + return eventDataProps; } public void setProps(HashMap props){ - + this.eventDataProps = props; } }
From 1b954d6fc60d96ed7879830269fe497c08f0d6fe Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 28 Feb 2024 14:06:44 +0530 Subject: [PATCH 086/219] use canonical name instead of class for key --- .../client/src/main/java/zingg/common/client/Client.java | 8 ++++---- .../common/client/event/listeners/EventsListener.java | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index d087705d5..43f27d753 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -336,13 +336,13 @@ protected ArgumentsUtil getArgsUtil() { return argsUtil; } - public void addListener(IEvent event, IEventListener listener) { - EventsListener.getInstance().addListener(event.getClass(), listener); + public void addListener(Class<? extends IEvent> eventClass, IEventListener listener) { + EventsListener.getInstance().addListener(eventClass, listener); } public void initializeListeners() { - addListener(new ZinggStartEvent(), new ZinggStartListener()); - addListener(new ZinggStopEvent(), new ZinggStopListener()); + addListener(ZinggStartEvent.class, new ZinggStartListener()); + addListener(ZinggStopEvent.class, new ZinggStopListener()); } public abstract S getSession();
diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java index 3483b7f8d..d7beb1bdc 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -6,7 +6,7 @@ public class EventsListener { private static EventsListener eventsListener = null; - private final ListMap<Class<? extends IEvent>, IEventListener> eventListeners; + private final ListMap<String, IEventListener> eventListeners; private EventsListener() { eventListeners = new ListMap<>(); @@ -19,7 +19,7 @@ public static EventsListener getInstance() { } public void addListener(Class<? extends IEvent> eventClass, IEventListener listener) { - eventListeners.add(eventClass, listener); + eventListeners.add(eventClass.getCanonicalName(), listener); } public void fireEvent(IEvent event) throws ZinggClientException { @@ -28,7 +28,7 @@ public void fireEvent(IEvent event) throws ZinggClientException { private void listen(IEvent event) throws ZinggClientException { Class<? extends IEvent> eventClass = event.getClass(); - for (IEventListener listener : eventListeners.get(eventClass)) { + for (IEventListener listener : eventListeners.get(eventClass.getCanonicalName())) { listener.listen(event); } }
From 9f7f9801f6623eb3292006c921e5f0b31f208227 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Mon, 4 Mar 2024 14:25:17 +0530 Subject: [PATCH 087/219] cols and FieldDefinition.java code changes --- .../zingg/common/client/FieldDefinition.java | 18 ++++++++- .../client/cols/FieldDefSelectedCols.java | 33 +++++++++++++++++ .../common/client/cols/ISelectedCols.java | 16 ++++++++ .../java/zingg/common/client/cols/Named.java | 8 ++++ .../common/client/cols/SelectedCols.java | 37 +++++++++++++++++++ .../client/cols/ZidAndFieldDefSelector.java | 14 +++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java create mode 100644 common/client/src/main/java/zingg/common/client/cols/ISelectedCols.java create mode 100644 common/client/src/main/java/zingg/common/client/cols/Named.java create mode 100644 common/client/src/main/java/zingg/common/client/cols/SelectedCols.java create mode 100644 common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java
diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 55c0c52f6..0adbd9e1a 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import zingg.common.client.cols.Named; import zingg.common.py.annotations.PythonClass; import zingg.common.py.annotations.PythonMethod; @@ -34,7 +35,7 @@ * @author sgoyal * */ @PythonClass(module = "client", outputDirectory = "python/zinggGenerated") -public class FieldDefinition
implements +public class FieldDefinition implements Named, Serializable { private static final long serialVersionUID = 1L; @@ -125,6 +126,21 @@ public void setFieldName(String fieldName) { this.fieldName = fieldName; } + public boolean isDontUse() { + // TODO Auto-generated method stub + throw new UnsupportedOperationException("Unimplemented method 'isDontUse'"); + } + + @Override + public String getName() { + return getFieldName(); + } + + @Override + public void setName(String name) { + setFieldName(name); + } + @Override public int hashCode() { final int prime = 31; diff --git a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java new file mode 100644 index 000000000..d359b6c0f --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java @@ -0,0 +1,33 @@ +package zingg.common.client.cols; + +import java.util.ArrayList; +import java.util.List; + +import zingg.common.client.FieldDefinition; + +public class FieldDefSelectedCols extends SelectedCols { + + public FieldDefSelectedCols(List fieldDefs, boolean showConcise) { + + List namedList = new ArrayList<>(); + + for (FieldDefinition fieldDef : fieldDefs) { + if (showConcise && fieldDef.isDontUse()) { + continue; + } + namedList.add(fieldDef); + } + + namedList.add(new FieldDefinition()); + List stringList = convertNamedListToStringList(namedList); + setCols(stringList); + } + + private List convertNamedListToStringList(List namedList) { + List stringList = new ArrayList<>(); + for (FieldDefinition named : namedList) { + stringList.add(named.getName()); + } + return stringList; + } +} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/cols/ISelectedCols.java b/common/client/src/main/java/zingg/common/client/cols/ISelectedCols.java new file mode 100644 index 000000000..1d48fc945 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/ISelectedCols.java @@ -0,0 +1,16 @@ +package zingg.common.client.cols; + +import java.util.List; + +public interface ISelectedCols { + + String[] getCols(List n); + + String[] getCols(); + + void setCols(List cols); + + void setNamedCols(List n); + + void setStringCols(List cols); +} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/cols/Named.java b/common/client/src/main/java/zingg/common/client/cols/Named.java new file mode 100644 index 000000000..1fbe2a0a6 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/Named.java @@ -0,0 +1,8 @@ +package zingg.common.client.cols; + +public interface Named { + + String getName(); + + void setName(String name); +} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/cols/SelectedCols.java b/common/client/src/main/java/zingg/common/client/cols/SelectedCols.java new file mode 100644 index 000000000..106afa534 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/SelectedCols.java @@ -0,0 +1,37 @@ +package zingg.common.client.cols; + +import java.util.List; + +public class SelectedCols implements ISelectedCols { + + private String[] cols; + + @Override + public String[] getCols(List n) { + String[] result = new String[n.size()]; + for (int i = 0; i < n.size(); i++) { + result[i] = n.get(i).getName(); + } + return result; + } + + @Override + public String[] getCols() { + return cols; + } + + @Override + public void setCols(List strings) { + this.cols = 
strings.toArray(new String[0]); + } + + @Override + public void setNamedCols(List n) { + this.cols = getCols(n); + } + + @Override + public void setStringCols(List columnNames) { + this.cols = columnNames.toArray(new String[0]); + } +} \ No newline at end of file diff --git a/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java new file mode 100644 index 000000000..3031c5bac --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java @@ -0,0 +1,14 @@ +package zingg.common.client.cols; + +import java.util.Arrays; +import java.util.List; + +public class ZidAndFieldDefSelector extends SelectedCols { + + public ZidAndFieldDefSelector(String[] fieldDefs) { + + List fieldDefList = Arrays.asList(fieldDefs); + fieldDefList.add(0, "zid"); + setCols(fieldDefList); + } +} \ No newline at end of file From 6e812ae36bd04f283612a42ebab039fb84713b27 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Mon, 4 Mar 2024 14:29:59 +0530 Subject: [PATCH 088/219] colName.ID_COL constant --- .../java/zingg/common/client/cols/ZidAndFieldDefSelector.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java index 3031c5bac..8511ea43e 100644 --- a/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java +++ b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java @@ -3,12 +3,14 @@ import java.util.Arrays; import java.util.List; +import zingg.common.client.util.ColName; + public class ZidAndFieldDefSelector extends SelectedCols { public ZidAndFieldDefSelector(String[] fieldDefs) { List fieldDefList = Arrays.asList(fieldDefs); - fieldDefList.add(0, "zid"); + fieldDefList.add(0, ColName.ID_COL); setCols(fieldDefList); } } \ No newline at end of file From ba295549fd4d12c92d8098aa532acfa5d5217e08 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 4 Mar 2024 17:15:02 +0530 Subject: [PATCH 089/219] using ZidAndFieldDefSelector to select cols --- .../common/client/ILabelDataViewHelper.java | 2 +- .../client/cols/FieldDefSelectedCols.java | 26 +++++++++++++------ .../client/cols/ZidAndFieldDefSelector.java | 22 +++++++++++----- .../core/executor/LabelDataViewHelper.java | 12 ++++----- .../common/core/executor/LabelUpdater.java | 10 +++---- .../zingg/common/core/executor/Labeller.java | 7 +++-- .../zingg/common/core/executor/Matcher.java | 5 +++- .../core/executor/TrainingDataFinder.java | 16 +++++++++--- 8 files changed, 65 insertions(+), 35 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java b/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java index 89e2ae44f..6385bc7f0 100644 --- a/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java +++ b/common/client/src/main/java/zingg/common/client/ILabelDataViewHelper.java @@ -8,7 +8,7 @@ public interface ILabelDataViewHelper { List getClusterIds(ZFrame lines); - List getDisplayColumns(ZFrame lines, IArguments args); +// List getDisplayColumns(ZFrame lines, IArguments args); ZFrame getCurrentPair(ZFrame lines, int index, List clusterIds, ZFrame clusterLines); diff --git a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java 
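Commit 088, continuing below, replaces the hard-coded "zid" literal in the constructor introduced above with the ColName.ID_COL constant. One caveat worth noting: Arrays.asList returns a fixed-size list, so the add(0, ...) call in that String[] constructor only works if the list is first copied into a resizable one; the List<FieldDefinition> rework in commit 089 further below moves away from this constructor. A sketch of the intended call site, matching how commit 089 wires the selector into Matcher and TrainingDataFinder (testData is an illustrative placeholder; frame type parameters elided as elsewhere in this patch):

// Project a frame down to the zid column plus the configured field columns.
ZidAndFieldDefSelector selector = new ZidAndFieldDefSelector(args.getFieldDefinition());
ZFrame projected = testData.select(selector.getCols());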
b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java index d359b6c0f..3584c501b 100644 --- a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java +++ b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java @@ -7,9 +7,21 @@ public class FieldDefSelectedCols extends SelectedCols { - public FieldDefSelectedCols(List fieldDefs, boolean showConcise) { + protected FieldDefSelectedCols() { + + } + + public FieldDefSelectedCols(List fieldDefs, boolean showConcise) { + List colList = getColList(fieldDefs, showConcise); + setCols(colList); + } + + protected List getColList(List fieldDefs) { + return getColList(fieldDefs,false); + } - List namedList = new ArrayList<>(); + protected List getColList(List fieldDefs, boolean showConcise) { + List namedList = new ArrayList(); for (FieldDefinition fieldDef : fieldDefs) { if (showConcise && fieldDef.isDontUse()) { @@ -17,14 +29,12 @@ public FieldDefSelectedCols(List fieldDefs, boolean showConcise } namedList.add(fieldDef); } - - namedList.add(new FieldDefinition()); List stringList = convertNamedListToStringList(namedList); - setCols(stringList); - } + return stringList; + } - private List convertNamedListToStringList(List namedList) { - List stringList = new ArrayList<>(); + protected List convertNamedListToStringList(List namedList) { + List stringList = new ArrayList(); for (FieldDefinition named : namedList) { stringList.add(named.getName()); } diff --git a/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java index 8511ea43e..62f5aac70 100644 --- a/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java +++ b/common/client/src/main/java/zingg/common/client/cols/ZidAndFieldDefSelector.java @@ -1,16 +1,24 @@ package zingg.common.client.cols; -import java.util.Arrays; import java.util.List; +import zingg.common.client.FieldDefinition; import zingg.common.client.util.ColName; -public class ZidAndFieldDefSelector extends SelectedCols { +public class ZidAndFieldDefSelector extends FieldDefSelectedCols { - public ZidAndFieldDefSelector(String[] fieldDefs) { + public ZidAndFieldDefSelector(List fieldDefs) { + this(fieldDefs, true, false); + } + + public ZidAndFieldDefSelector(List fieldDefs, boolean includeZid, boolean showConcise) { + List colList = getColList(fieldDefs, showConcise); + + if (includeZid) colList.add(0, ColName.ID_COL); + + colList.add(ColName.SOURCE_COL); + + setCols(colList); + } - List fieldDefList = Arrays.asList(fieldDefs); - fieldDefList.add(0, ColName.ID_COL); - setCols(fieldDefList); - } } \ No newline at end of file diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java index 9948fd4f5..d5bd5970d 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelDataViewHelper.java @@ -6,11 +6,9 @@ import org.apache.commons.logging.LogFactory; import zingg.common.client.ClientOptions; -import zingg.common.client.IArguments; import zingg.common.client.ILabelDataViewHelper; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; import 
zingg.common.core.context.Context; @@ -39,11 +37,11 @@ public List getClusterIds(ZFrame lines) { } - @Override - public List getDisplayColumns(ZFrame lines, IArguments args) { - return getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); - } - +// @Override +// public List getDisplayColumns(ZFrame lines, IArguments args) { +// return getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); +// } +// @Override public ZFrame getCurrentPair(ZFrame lines, int index, List clusterIds, ZFrame clusterLines) { diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java index 0143dfd24..f8049329e 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java @@ -1,6 +1,5 @@ package zingg.common.core.executor; -import java.util.List; import java.util.Scanner; import org.apache.commons.logging.Log; @@ -8,6 +7,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.ZidAndFieldDefSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; @@ -125,14 +125,14 @@ protected ZFrame getUpdatedRecords(ZFrame updatedRecords, int } protected int getUserInput(ZFrame lines,ZFrame currentPair,String cluster_id) { - - List displayCols = getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); - +// List displayCols = getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); + ZidAndFieldDefSelector zidAndFieldDefSelector = new ZidAndFieldDefSelector(args.getFieldDefinition(), false, args.getShowConcise()); int matchFlag = currentPair.getAsInt(currentPair.head(),ColName.MATCH_FLAG_COL); String preMsg = String.format("\n\tThe record pairs belonging to the input cluster id %s are:", cluster_id); String matchType = LabelMatchType.get(matchFlag).msg; String postMsg = String.format("\tThe above pair is labeled as %s\n", matchType); - int selectedOption = displayRecordsAndGetUserInput(getDSUtil().select(currentPair, displayCols), preMsg, postMsg); +// int selectedOption = displayRecordsAndGetUserInput(getDSUtil().select(currentPair, displayCols), preMsg, postMsg); + int selectedOption = displayRecordsAndGetUserInput(currentPair.select(zidAndFieldDefSelector.getCols()), preMsg, postMsg); getTrainingDataModel().updateLabellerStat(selectedOption, INCREMENT); getTrainingDataModel().updateLabellerStat(matchFlag, -1*INCREMENT); getLabelDataViewHelper().printMarkedRecordsStat( diff --git a/common/core/src/main/java/zingg/common/core/executor/Labeller.java b/common/core/src/main/java/zingg/common/core/executor/Labeller.java index f58020a11..3c496445f 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Labeller.java +++ b/common/core/src/main/java/zingg/common/core/executor/Labeller.java @@ -10,6 +10,7 @@ import zingg.common.client.ITrainingDataModel; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.ZidAndFieldDefSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; @@ -79,7 +80,8 @@ public ZFrame processRecordsCli(ZFrame lines) throws ZinggClientE ); lines = lines.cache(); - List displayCols = getLabelDataViewHelper().getDisplayColumns(lines, args); +// List displayCols = 
getLabelDataViewHelper().getDisplayColumns(lines, args); + ZidAndFieldDefSelector zidAndFieldDefSelector = new ZidAndFieldDefSelector(args.getFieldDefinition(), false, args.getShowConcise()); //have to introduce as snowframe can not handle row.getAs with column //name and row and lines are out of order for the code to work properly //snow getAsString expects row to have same struc as dataframe which is @@ -104,7 +106,8 @@ public ZFrame processRecordsCli(ZFrame lines) throws ZinggClientE msg2 = getLabelDataViewHelper().getMsg2(prediction, score); //String msgHeader = msg1 + msg2; - selectedOption = displayRecordsAndGetUserInput(getDSUtil().select(currentPair, displayCols), msg1, msg2); +// selectedOption = displayRecordsAndGetUserInput(getDSUtil().select(currentPair, displayCols), msg1, msg2); + selectedOption = displayRecordsAndGetUserInput(currentPair.select(zidAndFieldDefSelector.getCols()), msg1, msg2); getTrainingDataModel().updateLabellerStat(selectedOption, INCREMENT); getLabelDataViewHelper().printMarkedRecordsStat( getTrainingDataModel().getPositivePairsCount(), diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index dfcd050de..72586a1dc 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -8,6 +8,7 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.ZidAndFieldDefSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; @@ -35,7 +36,9 @@ public ZFrame getTestData() throws ZinggClientException{ } public ZFrame getFieldDefColumnsDS(ZFrame testDataOriginal) { - return getDSUtil().getFieldDefColumnsDS(testDataOriginal, args, true); + ZidAndFieldDefSelector zidAndFieldDefSelector = new ZidAndFieldDefSelector(args.getFieldDefinition()); + return testDataOriginal.select(zidAndFieldDefSelector.getCols()); +// return getDSUtil().getFieldDefColumnsDS(testDataOriginal, args, true); } diff --git a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java index 625750a5d..3c2919688 100644 --- a/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java +++ b/common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java @@ -1,10 +1,13 @@ package zingg.common.core.executor; +import java.util.Arrays; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.ZidAndFieldDefSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.pipe.Pipe; import zingg.common.client.util.ColName; @@ -79,7 +82,7 @@ public void execute() throws ZinggClientException { if (negPairs!= null) negPairs = negPairs.cache(); //create random samples for blocking ZFrame sampleOrginal = data.sample(false, args.getLabelDataSampleSize()).repartition(args.getNumPartitions()).cache(); - sampleOrginal = getDSUtil().getFieldDefColumnsDS(sampleOrginal, args, true); + sampleOrginal = getFieldDefColumnsDS(sampleOrginal); LOG.info("Preprocessing DS for stopWords"); ZFrame sample = getStopWords().preprocessForStopWords(sampleOrginal); @@ -188,7 +191,7 @@ public ZFrame getPositiveSamples(ZFrame 
data) throws Exception { } ZFrame posSample = data.sample(false, args.getLabelDataSampleSize()); //select only those columns which are mentioned in the field definitions - posSample = getDSUtil().getFieldDefColumnsDS(posSample, args, true); + posSample = getFieldDefColumnsDS(posSample); if (LOG.isDebugEnabled()) { LOG.debug("Sampled " + posSample.count()); } @@ -202,8 +205,13 @@ public ZFrame getPositiveSamples(ZFrame data) throws Exception { return posPairs; } + protected ZFrame getFieldDefColumnsDS(ZFrame data) { + ZidAndFieldDefSelector zidAndFieldDefSelector = new ZidAndFieldDefSelector(args.getFieldDefinition()); + String[] cols = zidAndFieldDefSelector.getCols(); + return data.select(cols); + //return getDSUtil().getFieldDefColumnsDS(data, args, true); + } + protected abstract StopWordsRemover getStopWords(); - - } From a3dd3f358bd53a2e7d8da35e4cf5b05db65b2054 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 4 Mar 2024 17:26:14 +0530 Subject: [PATCH 090/219] show concise, dont use fields not to be used --- .../java/zingg/common/client/cols/FieldDefSelectedCols.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java index 3584c501b..f0cf06f86 100644 --- a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java +++ b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java @@ -4,6 +4,7 @@ import java.util.List; import zingg.common.client.FieldDefinition; +import zingg.common.client.MatchType; public class FieldDefSelectedCols extends SelectedCols { @@ -24,7 +25,7 @@ protected List getColList(List fieldDefs, boo List namedList = new ArrayList(); for (FieldDefinition fieldDef : fieldDefs) { - if (showConcise && fieldDef.isDontUse()) { + if (showConcise && fieldDef.matchType.contains(MatchType.DONT_USE)) { continue; } namedList.add(fieldDef); From 3068d740648d00581679f944be4b929adba3b67d Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 5 Mar 2024 16:31:42 +0530 Subject: [PATCH 091/219] removed redundant getBlocks method --- .../src/main/java/zingg/common/core/executor/Linker.java | 7 +------ .../src/main/java/zingg/common/core/executor/Matcher.java | 4 ---- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index ed8c6fad1..8a437d0d9 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -13,6 +13,7 @@ public abstract class Linker extends Matcher { + private static final long serialVersionUID = 1L; protected static String name = "zingg.Linker"; public static final Log LOG = LogFactory.getLog(Linker.class); @@ -20,12 +21,6 @@ public Linker() { setZinggOption(ZinggOptions.LINK); } - public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ - // THIS LOG IS NEEDED FOR PLAN CALCULATION USING COUNT, DO NOT REMOVE - LOG.info("in getBlocks, blocked count is " + blocked.count()); - return getDSUtil().joinWithItselfSourceSensitive(blocked, ColName.HASH_COL, args).cache(); - } - public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 72586a1dc..f3624b85d 
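The getBlocks variants consolidated above all express one idea: self-join the blocked frame on the blocking hash so that rows sharing a block become candidate pairs, and keep each unordered pair exactly once. In outline, using only calls visible in the surrounding diffs (type parameters elided as elsewhere in this patch):

// Candidate pairs = rows sharing a HASH_COL value, each pair counted once.
ZFrame pairs = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true);
pairs = pairs.filter(pairs.gt(ColName.ID_COL));   // drop the mirrored ordering of each pair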
100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -52,10 +52,6 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin - public ZFrame getBlocks(ZFrameblocked) throws Exception{ - return getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); - } - public ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) From 7206bd77c843e827d6a4e2576d42075784c4a5d7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 5 Mar 2024 17:22:30 +0530 Subject: [PATCH 092/219] Revert "removed redundant getBlocks method" This reverts commit 3068d740648d00581679f944be4b929adba3b67d. --- .../src/main/java/zingg/common/core/executor/Linker.java | 7 ++++++- .../src/main/java/zingg/common/core/executor/Matcher.java | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 8a437d0d9..ed8c6fad1 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -13,7 +13,6 @@ public abstract class Linker extends Matcher { - private static final long serialVersionUID = 1L; protected static String name = "zingg.Linker"; public static final Log LOG = LogFactory.getLog(Linker.class); @@ -21,6 +20,12 @@ public Linker() { setZinggOption(ZinggOptions.LINK); } + public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ + // THIS LOG IS NEEDED FOR PLAN CALCULATION USING COUNT, DO NOT REMOVE + LOG.info("in getBlocks, blocked count is " + blocked.count()); + return getDSUtil().joinWithItselfSourceSensitive(blocked, ColName.HASH_COL, args).cache(); + } + public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index f3624b85d..72586a1dc 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -52,6 +52,10 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin + public ZFrame getBlocks(ZFrameblocked) throws Exception{ + return getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); + } + public ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) From fcb9e2bd1c18504d8fe72b1e3bd089121d1fac03 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 5 Mar 2024 17:23:53 +0530 Subject: [PATCH 093/219] removed redundant getBlocks method --- .../src/main/java/zingg/common/core/executor/Matcher.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 72586a1dc..17109e264 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -49,13 +49,7 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin 
ZFrame blocked1 = blocked.repartition(args.getNumPartitions(), blocked.col(ColName.HASH_COL)); //.cache(); return blocked1; } - - - public ZFrame getBlocks(ZFrameblocked) throws Exception{ - return getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); - } - public ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) From e910eb8e0d57b75617226b31ed22aa0bc8035221 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 6 Mar 2024 17:58:18 +0530 Subject: [PATCH 094/219] use IPairBuilder for building pairs --- .../zingg/common/core/executor/Linker.java | 22 +++++--- .../zingg/common/core/executor/Matcher.java | 42 +++++++------- .../zingg/common/core/pairs/IPairBuilder.java | 9 +++ .../common/core/pairs/SelfPairBuilder.java | 55 +++++++++++++++++++ .../pairs/SelfPairBuilderSourceSensitive.java | 26 +++++++++ 5 files changed, 125 insertions(+), 29 deletions(-) create mode 100644 common/core/src/main/java/zingg/common/core/pairs/IPairBuilder.java create mode 100644 common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java create mode 100644 common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilderSourceSensitive.java diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index ed8c6fad1..d97588d28 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -8,28 +8,27 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; +import zingg.common.core.pairs.IPairBuilder; +import zingg.common.core.pairs.SelfPairBuilderSourceSensitive; public abstract class Linker extends Matcher { + private static final long serialVersionUID = 1L; protected static String name = "zingg.Linker"; public static final Log LOG = LogFactory.getLog(Linker.class); public Linker() { setZinggOption(ZinggOptions.LINK); } - - public ZFrame getBlocks(ZFrame blocked, ZFrame bAll) throws Exception{ - // THIS LOG IS NEEDED FOR PLAN CALCULATION USING COUNT, DO NOT REMOVE - LOG.info("in getBlocks, blocked count is " + blocked.count()); - return getDSUtil().joinWithItselfSourceSensitive(blocked, ColName.HASH_COL, args).cache(); - } - + + @Override public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } + @Override public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws ZinggClientException { try { // input dupes are pairs @@ -53,12 +52,19 @@ public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws } } + @Override public ZFrame getDupesActualForGraph(ZFrame dupes) { ZFrame dupesActual = dupes .filter(dupes.equalTo(ColName.PREDICTION_COL, ColValues.IS_MATCH_PREDICTION)); return dupesActual; } - + @Override + public IPairBuilder getIPairBuilder() { + if(iPairBuilder==null) { + iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); + } + return iPairBuilder; + } } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 17109e264..858236f4c 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -15,6 +15,8 @@ import 
zingg.common.core.block.Canopy; import zingg.common.core.block.Tree; import zingg.common.core.model.Model; +import zingg.common.core.pairs.IPairBuilder; +import zingg.common.core.pairs.SelfPairBuilder; import zingg.common.core.preprocess.StopWordsRemover; import zingg.common.core.util.Analytics; import zingg.common.core.util.Metric; @@ -25,6 +27,7 @@ public abstract class Matcher extends ZinggBase{ protected static String name = "zingg.Matcher"; public static final Log LOG = LogFactory.getLog(Matcher.class); + protected IPairBuilder iPairBuilder; public Matcher() { setZinggOption(ZinggOptions.MATCH); @@ -50,26 +53,8 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin return blocked1; } - public ZFrame getBlocks(ZFrameblocked, ZFramebAll) throws Exception{ - ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); - /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) - .selectExpr("first.z_zid as z_zid", "second.z_zid as z_z_zid"); - */ - //joinH.show(); - joinH = joinH.filter(joinH.gt(ColName.ID_COL)); - LOG.warn("Num comparisons " + joinH.count()); - joinH = joinH.repartition(args.getNumPartitions(), joinH.col(ColName.ID_COL)); - bAll = bAll.repartition(args.getNumPartitions(), bAll.col(ColName.ID_COL)); - joinH = joinH.joinOnCol(bAll, ColName.ID_COL); - LOG.warn("Joining with actual values"); - //joinH.show(); - bAll = getDSUtil().getPrefixedColumnsDS(bAll); - //bAll.show(); - joinH = joinH.repartition(args.getNumPartitions(), joinH.col(ColName.COL_PREFIX + ColName.ID_COL)); - joinH = joinH.joinOnCol(bAll, ColName.COL_PREFIX + ColName.ID_COL); - LOG.warn("Joining again with actual values"); - //joinH.show(); - return joinH; + public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception{ + return getIPairBuilder().getPairs(blocked, bAll); } protected abstract Model getModel() throws ZinggClientException; @@ -91,7 +76,7 @@ protected ZFrame predictOnBlocks(ZFrameblocks) throws Exception, Z } protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ - ZFrame blocks = getBlocks(selectColsFromBlocked(blocked), testData); + ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData); ZFramedupesActual = predictOnBlocks(blocks); return getDupesActualForGraph(dupesActual); } @@ -285,6 +270,21 @@ protected ZFrame selectColsFromDupes(ZFramedupesActual) { protected abstract StopWordsRemover getStopWords(); + /** + * Each sub class of matcher can inject it's own iPairBuilder implementation + * @return + */ + public IPairBuilder getIPairBuilder() { + if(iPairBuilder==null) { + iPairBuilder = new SelfPairBuilder (getDSUtil(),args); + } + return iPairBuilder; + } + + public void setIPairBuilder(IPairBuilder iPairBuilder) { + this.iPairBuilder = iPairBuilder; + } + } diff --git a/common/core/src/main/java/zingg/common/core/pairs/IPairBuilder.java b/common/core/src/main/java/zingg/common/core/pairs/IPairBuilder.java new file mode 100644 index 000000000..235483818 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/pairs/IPairBuilder.java @@ -0,0 +1,9 @@ +package zingg.common.core.pairs; + +import zingg.common.client.ZFrame; + +public interface IPairBuilder { + + public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception; + +} diff --git a/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java new file mode 100644 index 000000000..4d0fff71d --- /dev/null +++ 
b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java @@ -0,0 +1,55 @@ +package zingg.common.core.pairs; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.IArguments; +import zingg.common.client.ZFrame; +import zingg.common.client.util.ColName; +import zingg.common.client.util.DSUtil; + +public class SelfPairBuilder implements IPairBuilder { + + protected DSUtil dsUtil; + public static final Log LOG = LogFactory.getLog(SelfPairBuilder.class); + protected IArguments args; + + public SelfPairBuilder(DSUtil dsUtil, IArguments args) { + this.dsUtil = dsUtil; + this.args = args; + } + + @Override + public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception { + ZFramejoinH = getDSUtil().joinWithItself(blocked, ColName.HASH_COL, true).cache(); + /*ZFramejoinH = blocked.as("first").joinOnCol(blocked.as("second"), ColName.HASH_COL) + .selectExpr("first.z_zid as z_zid", "second.z_zid as z_z_zid"); + */ + //joinH.show(); + joinH = joinH.filter(joinH.gt(ColName.ID_COL)); + LOG.warn("Num comparisons " + joinH.count()); + joinH = joinH.repartition(args.getNumPartitions(), joinH.col(ColName.ID_COL)); + bAll = bAll.repartition(args.getNumPartitions(), bAll.col(ColName.ID_COL)); + joinH = joinH.joinOnCol(bAll, ColName.ID_COL); + LOG.warn("Joining with actual values"); + //joinH.show(); + bAll = getDSUtil().getPrefixedColumnsDS(bAll); + //bAll.show(); + joinH = joinH.repartition(args.getNumPartitions(), joinH.col(ColName.COL_PREFIX + ColName.ID_COL)); + joinH = joinH.joinOnCol(bAll, ColName.COL_PREFIX + ColName.ID_COL); + LOG.warn("Joining again with actual values"); + //joinH.show(); + return joinH; + } + + public DSUtil getDSUtil() { + return dsUtil; + } + + public void setDSUtil(DSUtil dsUtil) { + this.dsUtil = dsUtil; + } + + + +} diff --git a/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilderSourceSensitive.java b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilderSourceSensitive.java new file mode 100644 index 000000000..293eb162c --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilderSourceSensitive.java @@ -0,0 +1,26 @@ +package zingg.common.core.pairs; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.IArguments; +import zingg.common.client.ZFrame; +import zingg.common.client.util.ColName; +import zingg.common.client.util.DSUtil; + +public class SelfPairBuilderSourceSensitive extends SelfPairBuilder { + + public static final Log LOG = LogFactory.getLog(SelfPairBuilderSourceSensitive.class); + + public SelfPairBuilderSourceSensitive(DSUtil dsUtil, IArguments args) { + super(dsUtil, args); + } + + @Override + public ZFrame getPairs(ZFrame blocked, ZFrame bAll) throws Exception{ + // THIS LOG IS NEEDED FOR PLAN CALCULATION USING COUNT, DO NOT REMOVE + LOG.info("in getBlocks, blocked count is " + blocked.count()); + return getDSUtil().joinWithItselfSourceSensitive(blocked, ColName.HASH_COL, args).cache(); + } + +} From bc37dfa61fae3bf52eacd9500e672e9b15ad11e7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 7 Mar 2024 11:17:49 +0530 Subject: [PATCH 095/219] refactor use overloaded getPairs method --- .../zingg/common/core/executor/Linker.java | 14 ++++------- .../zingg/common/core/executor/Matcher.java | 24 ++++--------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java 
b/common/core/src/main/java/zingg/common/core/executor/Linker.java index d97588d28..2dbbaa66c 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -8,7 +8,6 @@ import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; -import zingg.common.core.pairs.IPairBuilder; import zingg.common.core.pairs.SelfPairBuilderSourceSensitive; @@ -27,6 +26,11 @@ public Linker() { public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } + + @Override + public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception{ + return getPairs(blocked, bAll, new SelfPairBuilderSourceSensitive (getDSUtil(),args)); + } @Override public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws ZinggClientException { @@ -59,12 +63,4 @@ public ZFrame getDupesActualForGraph(ZFrame dupes) { return dupesActual; } - @Override - public IPairBuilder getIPairBuilder() { - if(iPairBuilder==null) { - iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); - } - return iPairBuilder; - } - } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 858236f4c..c3c23fc89 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -27,8 +27,6 @@ public abstract class Matcher extends ZinggBase{ protected static String name = "zingg.Matcher"; public static final Log LOG = LogFactory.getLog(Matcher.class); - protected IPairBuilder iPairBuilder; - public Matcher() { setZinggOption(ZinggOptions.MATCH); } @@ -54,7 +52,11 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin } public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception{ - return getIPairBuilder().getPairs(blocked, bAll); + return getPairs(blocked, bAll, new SelfPairBuilder (getDSUtil(),args)); + } + + public ZFrame getPairs(ZFrameblocked, ZFramebAll, IPairBuilder iPairBuilder) throws Exception{ + return iPairBuilder.getPairs(blocked, bAll); } protected abstract Model getModel() throws ZinggClientException; @@ -270,21 +272,5 @@ protected ZFrame selectColsFromDupes(ZFramedupesActual) { protected abstract StopWordsRemover getStopWords(); - /** - * Each sub class of matcher can inject it's own iPairBuilder implementation - * @return - */ - public IPairBuilder getIPairBuilder() { - if(iPairBuilder==null) { - iPairBuilder = new SelfPairBuilder (getDSUtil(),args); - } - return iPairBuilder; - } - - public void setIPairBuilder(IPairBuilder iPairBuilder) { - this.iPairBuilder = iPairBuilder; - } - - } From 95b6c766365085e76270e91771528cf41b21f0c6 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 13:21:55 +0530 Subject: [PATCH 096/219] Revert "Merge pull request #788 from gnanaprakash-ravi/EntepriseIssue784" This reverts commit b8316e6e27f44c88ada181e08ad383d8c1c0e9bd, reversing changes made to 5fd1b385f07dd8bdead04e7b84babca95cef492c. 
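Taken together, patches 094 and 095 make pair generation a pluggable step: Matcher.getPairs(blocked, bAll) delegates to a SelfPairBuilder by default, and Linker overrides it to route through SelfPairBuilderSourceSensitive. The sketch below shows how a further implementation could plug into the overloaded getPairs(blocked, bAll, iPairBuilder) hook added in patch 095. FilteredPairBuilder is hypothetical and not part of these patches, and the generic type parameters on ZFrame and DSUtil are elided here just as in the diff excerpts above.

    package zingg.common.core.pairs;

    import zingg.common.client.IArguments;
    import zingg.common.client.ZFrame;
    import zingg.common.client.util.DSUtil;

    // Hypothetical third IPairBuilder implementation. It reuses SelfPairBuilder's
    // hash-based self-join and only post-processes the result, to illustrate the
    // extension point; it is a sketch, not code from the patch series.
    public class FilteredPairBuilder extends SelfPairBuilder {

        public FilteredPairBuilder(DSUtil dsUtil, IArguments args) {
            super(dsUtil, args);
        }

        @Override
        public ZFrame getPairs(ZFrame blocked, ZFrame bAll) throws Exception {
            // build candidate pairs exactly as SelfPairBuilder does
            ZFrame pairs = super.getPairs(blocked, bAll);
            // a subclass could sample, cap, or filter the candidate pairs here
            // before they reach the model; this sketch returns them unchanged
            return pairs;
        }
    }

A Matcher subclass would then call getPairs(blocked, bAll, new FilteredPairBuilder(getDSUtil(), args)) instead of relying on the default builder.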
--- .../java/zingg/common/client/Arguments.java | 2 +- .../zingg/common/client/FieldDefinition.java | 1 - .../java/zingg/common/client/pipe/Pipe.java | 2 +- .../common/py/annotations/PythonClass.java | 8 +- .../common/py/annotations/PythonMethod.java | 2 + .../py/processors/PythonClassProcessor.java | 123 ++++++++---------- .../py/processors/PythonMethodProcessor.java | 52 +++++++- examples/febrl/GeneratedFebrlExample.py | 7 +- python/MANIFEST.in | 1 - python/setup.py | 10 +- .../client.py => zingg/ArgumentsGenerated.py} | 37 ------ python/zingg/FieldDefinitionGenerated.py | 37 ++++++ .../pipes.py => zingg/PipeGenerated.py} | 0 python/{zinggOld => zingg}/client.py | 0 python/zingg/otherThanGeneratedArguments.py | 2 +- .../otherThanGeneratedFieldDefinition.py | 2 +- python/zingg/otherThanGeneratedPipe.py | 2 +- python/{zinggOld => zingg}/pipes.py | 0 18 files changed, 156 insertions(+), 132 deletions(-) rename python/{zinggGenerated/client.py => zingg/ArgumentsGenerated.py} (80%) create mode 100644 python/zingg/FieldDefinitionGenerated.py rename python/{zinggGenerated/pipes.py => zingg/PipeGenerated.py} (100%) rename python/{zinggOld => zingg}/client.py (100%) rename python/{zinggOld => zingg}/pipes.py (100%) diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index 47474c310..5116a5cd9 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -81,7 +81,7 @@ * } * */ -@PythonClass(module = "client", outputDirectory = "python/zinggGenerated") +@PythonClass @JsonInclude(Include.NON_NULL) public class Arguments implements Serializable, IArguments { diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 0adbd9e1a..f162a9107 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -34,7 +34,6 @@ * @author sgoyal * */ -@PythonClass(module = "client", outputDirectory = "python/zinggGenerated") public class FieldDefinition implements Named, Serializable { diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index e726160d3..aab0878b1 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -20,7 +20,7 @@ * @author sgoyal * */ -@PythonClass(module = "pipes", outputDirectory = "python/zinggGenerated") +@PythonClass @JsonInclude(Include.NON_NULL) public class Pipe implements Serializable{ // St:StructType, Sv:SaveMode diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java index e557f9a3c..0d3bf21a5 100644 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java @@ -1,11 +1,9 @@ package zingg.common.py.annotations; +import javax.annotation.processing.*; + import java.lang.annotation.Target; import java.lang.annotation.ElementType; @Target({ElementType.TYPE}) -public @interface PythonClass { - String module(); - String parent() default ""; - String outputDirectory(); -} \ No newline at end of file +public @interface PythonClass {} \ No newline at end of file 
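The hunk above restores @PythonClass to a bare marker annotation: the module and outputDirectory elements disappear, and the processor goes back to deriving the target folder from the package name (see determineOutputDirectory further down in this patch). A schematic of the restored usage follows; ExampleConfig is an invented class used only for illustration and does not appear in the patches.

    package zingg.common.client;

    import zingg.common.py.annotations.PythonClass;
    import zingg.common.py.annotations.PythonMethod;

    // Invented example: with the post-revert marker annotations, the processor
    // would emit python/zingg/ExampleConfigGenerated.py containing a wrapper
    // class with __init__ plus one Python method per @PythonMethod.
    @PythonClass
    public class ExampleConfig {

        private String modelId;

        @PythonMethod
        public void setModelId(String modelId) {
            this.modelId = modelId;
        }

        @PythonMethod
        public String getModelId() {
            return modelId;
        }
    }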
diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java index a37807d90..f59a9c038 100644 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java +++ b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java @@ -1,5 +1,7 @@ package zingg.common.py.annotations; +import javax.annotation.processing.*; + import java.lang.annotation.Target; import java.lang.annotation.ElementType; diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 3f41ac899..17bd7bdba 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -3,8 +3,10 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.util.HashSet; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import javax.annotation.processing.*; import java.util.Set; @@ -18,88 +20,66 @@ @SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") public class PythonClassProcessor extends AbstractProcessor { - private Set processedElements = new HashSet<>(); - private Set folders = new HashSet<>(); + private Map> classMethodsMap = new HashMap<>(); @Override public synchronized void init(ProcessingEnvironment processingEnv) { - System.out.println("ProcessingEnv " + processingEnv); super.init(processingEnv); - - // Clear the output directory on initialization - folders.add("python/zinggGenerated"); - folders.add("common/python"); - folders.add("snowflake/python"); - folders.add("spark/python"); - - for (String folder : folders) { - File directory = new File(folder); - if (directory.exists()) { - for (File file : directory.listFiles()) { - file.delete(); - System.out.println(file + "deeellleeeeteeed"); - System.out.println(file + "geeneerateedddd"); - } - } - } } @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { - // Process each PythonClass annotated element + + // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { - if (element.getKind() == ElementKind.CLASS && !processedElements.contains(element)) { - processClass((TypeElement) element, roundEnv); + if (element.getKind() == ElementKind.CLASS) { + TypeElement classElement = (TypeElement) element; + PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); + String packageName = packageElement.getQualifiedName().toString(); + List methodNames = new ArrayList<>(); + + String outputDirectory = determineOutputDirectory(packageName); + + try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) { + generateImportsAndDeclarations(element, fileWriter); + + fileWriter.write("class " + element.getSimpleName() + ":\n"); + + // __init__ method + fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, element) + "):\n"); + generateClassInitializationCode(classElement, element, fileWriter); + + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { + if (methodElement.getAnnotation(PythonMethod.class) != null) { + methodNames.add(methodElement.getSimpleName().toString()); + } + } + 
classMethodsMap.put(element.getSimpleName().toString(), methodNames); + } catch (IOException e) { + e.printStackTrace(); + } } } + ProcessorContext processorContext = ProcessorContext.getInstance(); + processorContext.getClassMethodsMap().putAll(classMethodsMap); + return false; } + Map> getClassMethodsMap() { + return classMethodsMap; + } - private void processClass(TypeElement classElement, RoundEnvironment roundEnv) { - - // Mark the class as processed - processedElements.add(classElement); - - PythonClass pythonClassAnnotation = classElement.getAnnotation(PythonClass.class); - - String outputDirectory = pythonClassAnnotation.outputDirectory(); - String moduleName = pythonClassAnnotation.module(); - String outputFile = outputDirectory + File.separator + moduleName + ".py"; - String parentClassName = pythonClassAnnotation.parent(); - - try (FileWriter fileWriter = new FileWriter(outputFile, true)) { - generateImportsAndDeclarations(classElement, fileWriter); - - if (!parentClassName.isEmpty()) { - fileWriter.write("class " + classElement.getSimpleName() + "(" + parentClassName + "):\n"); - } else { - fileWriter.write("class " + classElement.getSimpleName() + ":\n"); - } - // System.out.println(classElement.getSimpleName() + "ccccccccccccccccccccccccc"); - - // __init__ method - fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, classElement) + "):\n"); - generateClassInitializationCode(classElement, classElement, fileWriter); - - for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { - if (methodElement.getAnnotation(PythonMethod.class) != null) { - String javadoc = processingEnv.getElementUtils().getDocComment(methodElement); - if (javadoc != null) { - fileWriter.write(" '''\n"); - fileWriter.write(javadoc.trim()); - fileWriter.write("\n '''\n"); - } - - fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + PythonMethodProcessor.generateMethodSignature(methodElement) + "):\n"); - PythonMethodProcessor.generateMethodReturn(methodElement, fileWriter); - PythonMethodProcessor.generateFieldAssignment(methodElement, fileWriter); - fileWriter.write("\n"); - } - } - } catch (IOException e) { - e.printStackTrace(); + private String determineOutputDirectory(String packageName) { + if (packageName.contains("enterprise") && packageName.contains("common")) { + return "common/python"; + } else if (packageName.contains("enterprise") && packageName.contains("snowflake")) { + return "snowflake/python"; + } else if (packageName.contains("enterprise") && packageName.contains("spark")) { + return "spark/python"; + } else { + return "python/zingg"; } } @@ -152,6 +132,15 @@ else if (element.getSimpleName().contentEquals("FieldDefinition")) { fileWriter.write("\n"); } + // private void generateFieldInitializationCode(VariableElement field, Element element) { + // String fieldName = field.getSimpleName().toString(); + // String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." 
+ fieldName + " = " + fieldName; + + // if (!fieldName.startsWith("FORMAT_")) { + // System.out.println(" " + fieldAssignment); + // } + // } + private String generateConstructorParameters(TypeElement classElement, Element element) { StringBuilder parameters = new StringBuilder(); diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 7781edb6d..1971adcb2 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,8 +1,10 @@ package zingg.common.py.processors; +import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.List; +import java.util.Map; import javax.annotation.processing.*; import javax.lang.model.type.TypeMirror; @@ -10,22 +12,59 @@ import java.util.Set; import javax.lang.model.element.*; +import zingg.common.py.annotations.*; @SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") public class PythonMethodProcessor extends AbstractProcessor { + + private Map> classMethodsMap; @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { + + ProcessorContext processorContext = ProcessorContext.getInstance(); + classMethodsMap = processorContext.getClassMethodsMap(); + + for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { + + if (element.getKind() == ElementKind.METHOD) { + ExecutableElement methodElement = (ExecutableElement) element; + String className = methodElement.getEnclosingElement().getSimpleName().toString(); + + if (classMethodsMap.containsKey(className)) { + List methodNames = classMethodsMap.get(className); + + if (methodNames.contains(methodElement.getSimpleName().toString())) { + try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { + + String javadoc = processingEnv.getElementUtils().getDocComment(methodElement); + if (javadoc != null) { + fileWriter.write(" '''\n"); + fileWriter.write(javadoc.trim()); + fileWriter.write("\n '''\n"); + } + + fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n"); + generateMethodReturn(methodElement, fileWriter); + generateFieldAssignment(methodElement, fileWriter); + fileWriter.write("\n"); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + } return false; } - public static String generateMethodSignature(ExecutableElement methodElement) { + private String generateMethodSignature(ExecutableElement methodElement) { StringBuilder signature = new StringBuilder(); signature.append(generateMethodParameters(methodElement)); return signature.toString(); } - public static String generateMethodParameters(ExecutableElement methodElement) { + private String generateMethodParameters(ExecutableElement methodElement) { StringBuilder parameters = new StringBuilder(); for (VariableElement parameter : methodElement.getParameters()) { parameters.append(", "); @@ -34,18 +73,23 @@ public static String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - public static void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + private void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { TypeMirror returnType = methodElement.getReturnType(); if 
(returnType.getKind() == TypeKind.VOID) { return; } else { + String returnTypeString = resolveType(returnType); String methodName = methodElement.getSimpleName().toString(); String className = methodElement.getEnclosingElement().getSimpleName().toString(); fileWriter.write(" return self." + className.toLowerCase() + "." + methodName + "()\n"); } } - public static void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + private String resolveType(TypeMirror typeMirror) { + return typeMirror.toString(); + } + + private void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { List parameters = methodElement.getParameters(); if (!parameters.isEmpty()) { diff --git a/examples/febrl/GeneratedFebrlExample.py b/examples/febrl/GeneratedFebrlExample.py index e667c2429..54c64e77e 100644 --- a/examples/febrl/GeneratedFebrlExample.py +++ b/examples/febrl/GeneratedFebrlExample.py @@ -1,5 +1,6 @@ -from zingg.zinggGenerated.client import * -from zingg.zinggGenerated.pipes import * +from zingg.ArgumentsGenerated import * +from zingg.FieldDefinitionGenerated import * +from zingg.PipeGenerated import * from zingg.otherThanGenerated import * from zingg.otherThanGeneratedPipe import * from zingg.otherThanGeneratedArguments import * @@ -38,7 +39,7 @@ args.setOutput(outputPipe) -options = ClientOptions([ClientOptions.PHASE,"match"]) +options = ClientOptions([ClientOptions.PHASE,"findTrainingData"]) #Zingg execution for the given phase zingg = Zingg(args, options) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 4caa178c5..b9582aea0 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -11,5 +11,4 @@ recursive-include zingg/examples/amazon-google * recursive-include zingg/examples/febrl * recursive-include zingg/models * recursive-include zingg/phases *.py -recursive-include zingg/zinggGenerated *.py recursive-include zingg/config * diff --git a/python/setup.py b/python/setup.py index 3be6a72ff..514c2e180 100644 --- a/python/setup.py +++ b/python/setup.py @@ -56,7 +56,6 @@ DATA_PATH = os.path.join(ZINGG_HOME, "models") CONF_PATH = os.path.join(ZINGG_HOME, "config") PHASES_PATH = os.path.join(ZINGG_HOME, "python/phases") -GENERATEDCODE_PATH = os.path.join(ZINGG_HOME, "python/zinggGenerated") SCRIPTS_TARGET = os.path.join("zingg", "scripts") JARS_TARGET = os.path.join("zingg", "jars") @@ -64,7 +63,6 @@ DATA_TARGET = os.path.join("zingg", "models") CONF_TARGET = os.path.join("zingg", "config") PHASES_TARGET = os.path.join("zingg", "phases") -GENERATEDCODE_TARGET = os.path.join("zingg", "zinggGenerated") # Check and see if we are under the Zingg path in which case we need to build the symlink farm. 
# This is important because we only want to build the symlink farm while under Zingg otherwise we @@ -114,7 +112,6 @@ def run(self): os.symlink(DATA_PATH, DATA_TARGET) os.symlink(CONF_PATH, CONF_TARGET) os.symlink(PHASES_PATH, PHASES_TARGET) - os.symlink(GENERATEDCODE_PATH, GENERATEDCODE_TARGET) else: # For windows fall back to the slower copytree copytree(JARS_PATH, JARS_TARGET) @@ -123,7 +120,6 @@ def run(self): copytree(DATA_PATH, DATA_TARGET) copytree(CONF_PATH, CONF_TARGET) copytree(PHASES_PATH, PHASES_TARGET) - copytree(GENERATEDCODE_PATH, GENERATEDCODE_TARGET) else: # If we are not inside of ZINGG_HOME verify we have the required symlink farm if not os.path.exists(JARS_TARGET): @@ -162,8 +158,7 @@ def run(self): 'zingg.data': 'zingg/models', 'zingg.examples': 'zingg/examples', 'zingg.conf': 'zingg/config', - 'zingg.phases': 'zingg/phases', - 'zingg.zinggGenerated': 'zingg/zinggGenerated' + 'zingg.phases': 'zingg/phases' }, package_data={ 'zingg.jars': ['*.jar'], @@ -172,7 +167,6 @@ def run(self): 'zingg.examples': ['*.py', '*/examples/*.py'], 'zingg.conf': ['*'], 'zingg.phases': ['*'], - 'zingg.zinggGenerated': ['*'], '':['*.py'], '':['LICENSE'] }, @@ -204,7 +198,6 @@ def run(self): os.remove(os.path.join("zingg", "examples")) os.remove(os.path.join("zingg", "phases")) os.remove(os.path.join("zingg", "config")) - os.remove(os.path.join("zingg", "zinggGenerated")) else: rmtree(os.path.join("zingg", "jars")) rmtree(os.path.join("zingg", "scripts")) @@ -212,4 +205,3 @@ def run(self): rmtree(os.path.join("zingg", "examples")) rmtree(os.path.join("zingg", "phases")) rmtree(os.path.join("zingg", "config")) - rmtree(os.path.join("zingg", "zinggGenerated")) diff --git a/python/zinggGenerated/client.py b/python/zingg/ArgumentsGenerated.py similarity index 80% rename from python/zinggGenerated/client.py rename to python/zingg/ArgumentsGenerated.py index 5045a8ca2..bafb8d96d 100644 --- a/python/zinggGenerated/client.py +++ b/python/zingg/ArgumentsGenerated.py @@ -1,40 +1,3 @@ -from zingg.otherThanGenerated import * -''' -This class defines each field that we use in matching We can use this to - configure the properties of each field we use for matching in Zingg. - - @author sgoyal -''' -class FieldDefinition: - def __init__(self, name, dataType, *matchType): - self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() - self.fielddefinition.setFieldName(name) - self.fielddefinition.setDataType(self.stringify(dataType)) - self.fielddefinition.setMatchType(matchType) - self.fielddefinition.setFields(name) - - def getFieldDefinition(self): - return self.fielddefinition - - def setFields(self, fields): - self.fielddefinition.setFields(fields) - - ''' -Set the field type which defines the kind of matching we want to do - - @see MatchType - @param type - the type to set - ''' - def setMatchType(self, type): - self.fielddefinition.setMatchType(type) - - def setStopWords(self, stopWords): - self.fielddefinition.setStopWords(stopWords) - - def setFieldName(self, fieldName): - self.fielddefinition.setFieldName(fieldName) - from zingg.otherThanGenerated import * ''' This class helps supply match arguments to Zingg. 
There are 3 basic steps diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py new file mode 100644 index 000000000..b08d75984 --- /dev/null +++ b/python/zingg/FieldDefinitionGenerated.py @@ -0,0 +1,37 @@ +from zingg.otherThanGenerated import * +''' +This class defines each field that we use in matching We can use this to + configure the properties of each field we use for matching in Zingg. + + @author sgoyal +''' +class FieldDefinition: + def __init__(self, name, dataType, *matchType): + self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() + self.fielddefinition.setFieldName(name) + self.fielddefinition.setDataType(self.stringify(dataType)) + self.fielddefinition.setMatchType(matchType) + self.fielddefinition.setFields(name) + + def getFieldDefinition(self): + return self.fielddefinition + + def setFields(self, fields): + self.fielddefinition.setFields(fields) + + ''' +Set the field type which defines the kind of matching we want to do + + @see MatchType + @param type + the type to set + ''' + def setMatchType(self, type): + self.fielddefinition.setMatchType(type) + + def setStopWords(self, stopWords): + self.fielddefinition.setStopWords(stopWords) + + def setFieldName(self, fieldName): + self.fielddefinition.setFieldName(fieldName) + diff --git a/python/zinggGenerated/pipes.py b/python/zingg/PipeGenerated.py similarity index 100% rename from python/zinggGenerated/pipes.py rename to python/zingg/PipeGenerated.py diff --git a/python/zinggOld/client.py b/python/zingg/client.py similarity index 100% rename from python/zinggOld/client.py rename to python/zingg/client.py diff --git a/python/zingg/otherThanGeneratedArguments.py b/python/zingg/otherThanGeneratedArguments.py index 5abe1c7f5..113d08ead 100644 --- a/python/zingg/otherThanGeneratedArguments.py +++ b/python/zingg/otherThanGeneratedArguments.py @@ -1,4 +1,4 @@ -from zingg.zinggGenerated.client import * +from zingg.ArgumentsGenerated import * from zingg.otherThanGeneratedFieldDefinition import * class ExtendedArgumentsGenerated(Arguments): diff --git a/python/zingg/otherThanGeneratedFieldDefinition.py b/python/zingg/otherThanGeneratedFieldDefinition.py index 195499d07..43f3d229e 100644 --- a/python/zingg/otherThanGeneratedFieldDefinition.py +++ b/python/zingg/otherThanGeneratedFieldDefinition.py @@ -1,4 +1,4 @@ -from zingg.zinggGenerated.client import * +from zingg.FieldDefinitionGenerated import * class ExtendedFieldDefinitionGenerated(FieldDefinition): def __init__(self, name, dataType, *matchType): diff --git a/python/zingg/otherThanGeneratedPipe.py b/python/zingg/otherThanGeneratedPipe.py index e405b3386..a46df2794 100644 --- a/python/zingg/otherThanGeneratedPipe.py +++ b/python/zingg/otherThanGeneratedPipe.py @@ -1,4 +1,4 @@ -from zingg.zinggGenerated.pipes import * +from zingg.PipeGenerated import * class ExtendedPipeGenerated(Pipe): def __init__(self, name, format): diff --git a/python/zinggOld/pipes.py b/python/zingg/pipes.py similarity index 100% rename from python/zinggOld/pipes.py rename to python/zingg/pipes.py From 330a073b2a4caf7c775acfcfac6aaee4e799e580 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 13:25:22 +0530 Subject: [PATCH 097/219] Revert "javadoc support for generated python code" This reverts commit 722c43d177e97ff2823da716b813d9fb15679fe0. 
--- .../py/processors/PythonClassProcessor.java | 7 - .../py/processors/PythonMethodProcessor.java | 8 -- python/zingg/ArgumentsGenerated.py | 125 ------------------ python/zingg/FieldDefinitionGenerated.py | 13 -- python/zingg/PipeGenerated.py | 5 - 5 files changed, 158 deletions(-) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 17bd7bdba..879c6feb4 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -94,13 +94,6 @@ private void generateImportsAndDeclarations(Element element, FileWriter fileWrit fileWriter.write("JStructType = getJVM().org.apache.spark.sql.types.StructType\n"); fileWriter.write("\n"); } - - String javadoc = processingEnv.getElementUtils().getDocComment(element); - if (javadoc != null) { - fileWriter.write("'''\n"); - fileWriter.write(javadoc.trim()); - fileWriter.write("\n'''\n"); - } } private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException { diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 1971adcb2..6de712703 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -36,14 +36,6 @@ public boolean process(Set annotations, RoundEnvironment if (methodNames.contains(methodElement.getSimpleName().toString())) { try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { - - String javadoc = processingEnv.getElementUtils().getDocComment(methodElement); - if (javadoc != null) { - fileWriter.write(" '''\n"); - fileWriter.write(javadoc.trim()); - fileWriter.write("\n '''\n"); - } - fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n"); generateMethodReturn(methodElement, fileWriter); generateFieldAssignment(methodElement, fileWriter); diff --git a/python/zingg/ArgumentsGenerated.py b/python/zingg/ArgumentsGenerated.py index bafb8d96d..e03fc961c 100644 --- a/python/zingg/ArgumentsGenerated.py +++ b/python/zingg/ArgumentsGenerated.py @@ -1,64 +1,4 @@ from zingg.otherThanGenerated import * -''' -This class helps supply match arguments to Zingg. There are 3 basic steps - in any match process. -
-
-   • Defining - specifying information about data location, fields and our
-     notion of similarity.
-   • Training - making Zingg learn the matching rules
-   • Matching - Running the models on entire dataset
-
-   There is another step, creating labeled data, which can be used to create
-   training data if none is present. Let us cover them in greater detail through
-   an example.
-
-   We have some positive and negative labeled examples from which we want
-   Zingg to learn. These are saved in
-
-   /path/to/training/data/positive.csv and
-
-   /path/to/training/data/negative.csv
-
-   Our actual data has colA,colB,colC,colD,colE with comma as the delimiter and
-   is saved at
-
-   /path/to/match/data.csv.
-
-   We want to match on colB and colD only, one of which is String and other is
-   int
-
-   Our program would look like -
-
-   {
-   @code
-   Arguments args = new Arguments();
-   args.setDelimiter(",");
-   args.setPositiveTrainingSamples("/path/to/training/data/positive.csv");
-   args.setNegativeTrainingSamples("/path/to/training/data/negative.csv");
-
-   FieldDefinition colB = new FieldDefinition(1, FieldClass.STRING,
-           FieldType.WORD);
-   FieldDefinition colD = new FieldDefinition(3, FieldClass.INTEGER,
-           FieldType.NUMERIC);
-
-   List<FieldDefinition> fields = new ArrayList<FieldDefinition>();
-   fields.add(colB);
-   fields.add(colD);
-   args.setFieldDefinition(fields);
-
-   args.setMatchData("/path/to/match/data.csv");
-
-   args.setZinggDir("/path/to/models");
-   args.setOutputDir("/path/to/match/output");
-
-   Client client = new Client(args, "local");
-   client.train();
-   client.run();
-   }
-
-''' class Arguments: def __init__(self): self.arguments = getJVM().zingg.common.client.Arguments() @@ -66,98 +6,33 @@ def __init__(self): def setNumPartitions(self, numPartitions): self.arguments.setNumPartitions(numPartitions) - ''' -Set the fraction of data to be used from complete data set to be used for - seeding the labelled data Labelling is costly and we want a fast - approximate way of looking at a small sample of the records and - identifying expected matches and non matches - - @param labelDataSampleSize - - float between 0 and 1 denoting portion of dataset to use in - generating seed samples - @throws ZinggClientException - ''' def setLabelDataSampleSize(self, labelDataSampleSize): self.arguments.setLabelDataSampleSize(labelDataSampleSize) - ''' -Location for internal Zingg use. - - @return the path for internal Zingg usage - - public Pipe[] getZinggInternal() { - return zinggInternal; - } - - /** - Set the location for Zingg to save its internal computations and - models. Please set it to a place where the program has write access. - - @param zinggDir - path to the Zingg directory - - public void setZinggInternal(Pipe[] zinggDir) { - this.zinggInternal = zinggDir; - } - ''' def getModelId(self): return self.arguments.getModelId() def setModelId(self, modelId): self.arguments.setModelId(modelId) - ''' -Set the output directory where the match result will be saved - - @param outputDir - where the match result is saved - @throws ZinggClientException - ''' def setOutput(self, outputDir): self.arguments.setOutput(outputDir) - ''' -Set the location for Zingg to save its internal computations and - models. Please set it to a place where the program has write access. - - @param zinggDir - path to the Zingg directory - ''' def setZinggDir(self, zinggDir): self.arguments.setZinggDir(zinggDir) - ''' -Location for internal Zingg use. - - @return the path for internal Zingg usage - ''' def getZinggBaseModelDir(self): return self.arguments.getZinggBaseModelDir() def getZinggModelDir(self): return self.arguments.getZinggModelDir() - ''' -Location for internal Zingg use. - - @return the path for internal Zingg usage - ''' def getZinggBaseTrainingDataDir(self): return self.arguments.getZinggBaseTrainingDataDir() - ''' -Location for internal Zingg use. - - @return the path for internal Zingg usage - ''' def getZinggTrainingDataUnmarkedDir(self): return self.arguments.getZinggTrainingDataUnmarkedDir() - ''' -Location for internal Zingg use. - - @return the path for internal Zingg usage - ''' def getZinggTrainingDataMarkedDir(self): return self.arguments.getZinggTrainingDataMarkedDir() diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py index b08d75984..4713552bd 100644 --- a/python/zingg/FieldDefinitionGenerated.py +++ b/python/zingg/FieldDefinitionGenerated.py @@ -1,10 +1,4 @@ from zingg.otherThanGenerated import * -''' -This class defines each field that we use in matching We can use this to - configure the properties of each field we use for matching in Zingg. 
- - @author sgoyal -''' class FieldDefinition: def __init__(self, name, dataType, *matchType): self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() @@ -19,13 +13,6 @@ def getFieldDefinition(self): def setFields(self, fields): self.fielddefinition.setFields(fields) - ''' -Set the field type which defines the kind of matching we want to do - - @see MatchType - @param type - the type to set - ''' def setMatchType(self, type): self.fielddefinition.setMatchType(type) diff --git a/python/zingg/PipeGenerated.py b/python/zingg/PipeGenerated.py index 326404a11..6144a7177 100644 --- a/python/zingg/PipeGenerated.py +++ b/python/zingg/PipeGenerated.py @@ -6,11 +6,6 @@ FilePipe = getJVM().zingg.common.client.pipe.FilePipe JStructType = getJVM().org.apache.spark.sql.types.StructType -''' -Actual pipe def in the args. One pipe can be used at multiple places with different tables, locations, queries etc - - @author sgoyal -''' class Pipe: def __init__(self, name, format): self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe() From 6d87021209328a1539be27f5db8645b4727779dd Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 13:28:28 +0530 Subject: [PATCH 098/219] Revert "Merge pull request #774 from gnanaprakash-ravi/ZEIssue230" This reverts commit 7d9567c89a1f9e0006abbed8ea6884e4d6138512, reversing changes made to f9e9528dccfdc3148583a9c0585dd435e57e4249. --- .../java/zingg/common/client/Arguments.java | 22 +- .../zingg/common/client/FieldDefinition.java | 10 +- .../py/processors/PythonClassProcessor.java | 87 +-- .../py/processors/PythonMethodProcessor.java | 30 +- examples/febrl/GeneratedFebrlExample.py | 46 -- python/zingg/ArgumentsGenerated.py | 44 -- python/zingg/FieldDefinitionGenerated.py | 24 - python/zingg/PipeGenerated.py | 35 -- python/zingg/otherThanGenerated.py | 510 ------------------ python/zingg/otherThanGeneratedArguments.py | 56 -- .../otherThanGeneratedFieldDefinition.py | 20 - python/zingg/otherThanGeneratedPipe.py | 228 -------- 12 files changed, 68 insertions(+), 1044 deletions(-) delete mode 100644 examples/febrl/GeneratedFebrlExample.py delete mode 100644 python/zingg/ArgumentsGenerated.py delete mode 100644 python/zingg/FieldDefinitionGenerated.py delete mode 100644 python/zingg/PipeGenerated.py delete mode 100644 python/zingg/otherThanGenerated.py delete mode 100644 python/zingg/otherThanGeneratedArguments.py delete mode 100644 python/zingg/otherThanGeneratedFieldDefinition.py delete mode 100644 python/zingg/otherThanGeneratedPipe.py diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index 5116a5cd9..3f396f090 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -17,8 +17,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import zingg.common.client.pipe.Pipe; -import zingg.common.py.annotations.PythonClass; -import zingg.common.py.annotations.PythonMethod; /** @@ -81,7 +79,6 @@ * } * */ -@PythonClass @JsonInclude(Include.NON_NULL) public class Arguments implements Serializable, IArguments { @@ -124,7 +121,7 @@ public Arguments() { public int getNumPartitions() { return numPartitions; } - @PythonMethod + @Override public void setNumPartitions(int numPartitions) throws ZinggClientException{ if (numPartitions != -1 && numPartitions <= 0) @@ -157,7 +154,6 @@ public float getLabelDataSampleSize() { * generating seed samples * @throws ZinggClientException */ - 
@PythonMethod @Override public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClientException { if (labelDataSampleSize > 1 || labelDataSampleSize < 0) @@ -239,12 +235,12 @@ public void setZinggInternal(Pipe[] zinggDir) { */ - @PythonMethod + @Override public String getModelId() { return modelId; } - @PythonMethod + @Override public void setModelId(String modelId) { this.modelId = modelId; @@ -267,7 +263,6 @@ public Pipe[] getOutput() { * where the match result is saved * @throws ZinggClientException */ - @PythonMethod @Override public void setOutput(Pipe[] outputDir) throws ZinggClientException { //checkNullBlankEmpty(outputDir, " path for saving results"); @@ -345,7 +340,6 @@ public String getZinggDir() { * @param zinggDir * path to the Zingg directory */ - @PythonMethod @Override public void setZinggDir(String zinggDir) { this.zinggDir = zinggDir; @@ -357,13 +351,12 @@ public void setZinggDir(String zinggDir) { * * @return the path for internal Zingg usage */ - @PythonMethod + @Override @JsonIgnore public String getZinggBaseModelDir(){ return zinggDir + "/" + modelId; } - @PythonMethod @Override @JsonIgnore public String getZinggModelDir() { @@ -393,7 +386,6 @@ public String getZinggDataDocFile() { * * @return the path for internal Zingg usage */ - @PythonMethod @Override @JsonIgnore public String getZinggBaseTrainingDataDir() { @@ -407,7 +399,6 @@ public String getZinggBaseTrainingDataDir() { * * @return the path for internal Zingg usage */ - @PythonMethod @Override @JsonIgnore public String getZinggTrainingDataUnmarkedDir() { @@ -419,7 +410,6 @@ public String getZinggTrainingDataUnmarkedDir() { * * @return the path for internal Zingg usage */ - @PythonMethod @Override @JsonIgnore public String getZinggTrainingDataMarkedDir() { @@ -488,7 +478,7 @@ public void setCollectMetrics(boolean collectMetrics) { public float getStopWordsCutoff() { return stopWordsCutoff; } - @PythonMethod + @Override public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException { if (stopWordsCutoff > 1 || stopWordsCutoff < 0) @@ -510,7 +500,7 @@ public void setShowConcise(boolean showConcise) { public String getColumn() { return column; } - @PythonMethod + @Override public void setColumn(String column) { this.column = column; diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index f162a9107..0e3ad4d99 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -22,10 +22,6 @@ import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.ser.std.StdSerializer; -import zingg.common.client.cols.Named; -import zingg.common.py.annotations.PythonClass; -import zingg.common.py.annotations.PythonMethod; - /** * This class defines each field that we use in matching We can use this to @@ -56,7 +52,7 @@ public FieldDefinition() { } public String getFields() { return fields; } - @PythonMethod + public void setFields(String fields) { this.fields = fields;} /** @@ -75,7 +71,6 @@ public List getMatchType() { * @param type * the type to set */ - @PythonMethod @JsonDeserialize(using = MatchTypeDeserializer.class) public void setMatchType(List type) { this.matchType = type; //MatchTypeDeserializer.getMatchTypeFromString(type); @@ -103,7 +98,7 @@ public void setDataType(String d) { public String getStopWords() { return stopWords; } - @PythonMethod + 
public void setStopWords(String stopWords) { this.stopWords = stopWords; } @@ -120,7 +115,6 @@ public String getFieldName() { return fieldName; } - @PythonMethod public void setFieldName(String fieldName) { this.fieldName = fieldName; } diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 879c6feb4..0efd7bf25 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -1,8 +1,5 @@ package zingg.common.py.processors; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -20,6 +17,7 @@ @SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") public class PythonClassProcessor extends AbstractProcessor { + private boolean importsAndDeclarationsGenerated = false; private Map> classMethodsMap = new HashMap<>(); @Override @@ -30,6 +28,12 @@ public synchronized void init(ProcessingEnvironment processingEnv) { @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { + // Imports and global declarations + if (!importsAndDeclarationsGenerated) { + generateImportsAndDeclarations(); + importsAndDeclarationsGenerated = true; + } + // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { @@ -44,22 +48,28 @@ public boolean process(Set annotations, RoundEnvironment try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) { generateImportsAndDeclarations(element, fileWriter); - fileWriter.write("class " + element.getSimpleName() + ":\n"); + System.out.println("class " + element.getSimpleName() + ":"); + + // __init__ method + System.out.println(" def __init__(self" + + generateConstructorParameters(classElement) + "):"); + generateClassInitializationCode(classElement, element); - // __init__ method - fileWriter.write(" def __init__(self" + generateConstructorParameters(classElement, element) + "):\n"); - generateClassInitializationCode(classElement, element, fileWriter); + // for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + // if (!field.getSimpleName().contentEquals("serialVersionUID")) { + // generateFieldInitializationCode(field, element); + // } + // } - for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { - if (methodElement.getAnnotation(PythonMethod.class) != null) { - methodNames.add(methodElement.getSimpleName().toString()); - } + for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { + if (methodElement.getAnnotation(PythonMethod.class) != null) { + methodNames.add(methodElement.getSimpleName().toString()); } - classMethodsMap.put(element.getSimpleName().toString(), methodNames); - } catch (IOException e) { - e.printStackTrace(); } + classMethodsMap.put(element.getSimpleName().toString(), methodNames); } + System.out.println(); + // rest of generated class contents } ProcessorContext processorContext = ProcessorContext.getInstance(); processorContext.getClassMethodsMap().putAll(classMethodsMap); @@ -94,13 +104,20 @@ private void generateImportsAndDeclarations(Element element, FileWriter fileWrit fileWriter.write("JStructType = 
getJVM().org.apache.spark.sql.types.StructType\n"); fileWriter.write("\n"); } + private void generateImportsAndDeclarations() { + System.out.println("import logging"); + System.out.println("from zingg.client import *"); + System.out.println("LOG = logging.getLogger(\"zingg.pipes\")"); + System.out.println(); + System.out.println("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe"); + System.out.println("FilePipe = getJVM().zingg.common.client.pipe.FilePipe"); + System.out.println("JStructType = getJVM().org.apache.spark.sql.types.StructType"); + System.out.println(); } - private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException { + private void generateClassInitializationCode(TypeElement classElement, Element element) { if (element.getSimpleName().contentEquals("Pipe")) { - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setName(name)\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFormat(format)\n"); + System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); } else if (element.getSimpleName().contentEquals("EPipe")) { fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n"); @@ -134,32 +151,18 @@ else if (element.getSimpleName().contentEquals("FieldDefinition")) { // } // } - private String generateConstructorParameters(TypeElement classElement, Element element) { - + private String generateConstructorParameters(TypeElement classElement) { StringBuilder parameters = new StringBuilder(); + List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); - if (element.getSimpleName().contentEquals("Arguments")) { - // For the "Arguments" class, no constructor parameters are needed - return ""; - } - else if (element.getSimpleName().contentEquals("Pipe")) { - parameters.append(", name, format"); - } - else if (element.getSimpleName().contentEquals("FieldDefinition")) { - parameters.append(", name, dataType, *matchType"); - } - else { - List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); - - fields = fields.stream() - .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) - .filter(this::isFieldForConstructor) - .collect(Collectors.toList()); - - for (VariableElement field : fields) { - parameters.append(", "); - parameters.append(field.getSimpleName()); - } + fields = fields.stream() + .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) + .filter(this::isFieldForConstructor) + .collect(Collectors.toList()); + + for (VariableElement field : fields) { + parameters.append(", "); + parameters.append(field.getSimpleName()); } return parameters.toString(); } diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 6de712703..183b6458d 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,8 +1,5 @@ package zingg.common.py.processors; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; import java.util.List; import java.util.Map; 
@@ -10,6 +7,7 @@ import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; +// import java.util.logging.Logger; import javax.lang.model.element.*; import zingg.common.py.annotations.*; @@ -18,13 +16,16 @@ public class PythonMethodProcessor extends AbstractProcessor { private Map> classMethodsMap; + // private static final Logger LOG = Logger.getLogger(PythonMethodProcessor.class.getName()); @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { ProcessorContext processorContext = ProcessorContext.getInstance(); classMethodsMap = processorContext.getClassMethodsMap(); + // LOG.info("Processing PythonMethod annotations..."); + // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { if (element.getKind() == ElementKind.METHOD) { @@ -35,18 +36,17 @@ public boolean process(Set annotations, RoundEnvironment List methodNames = classMethodsMap.get(className); if (methodNames.contains(methodElement.getSimpleName().toString())) { - try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) { - fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n"); - generateMethodReturn(methodElement, fileWriter); - generateFieldAssignment(methodElement, fileWriter); - fileWriter.write("\n"); - } catch (IOException e) { - e.printStackTrace(); - } + // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); + System.out.println(" def " + methodElement.getSimpleName() + "(self" + + generateMethodSignature(methodElement) + "):"); + generateMethodReturn(methodElement); + generateFieldAssignment(methodElement); } } } + System.out.println(); } + // LOG.info("Processing complete."); return false; } @@ -65,7 +65,7 @@ private String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - private void generateMethodReturn(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + private void generateMethodReturn(ExecutableElement methodElement) { TypeMirror returnType = methodElement.getReturnType(); if (returnType.getKind() == TypeKind.VOID) { return; @@ -73,7 +73,7 @@ private void generateMethodReturn(ExecutableElement methodElement, FileWriter fi String returnTypeString = resolveType(returnType); String methodName = methodElement.getSimpleName().toString(); String className = methodElement.getEnclosingElement().getSimpleName().toString(); - fileWriter.write(" return self." + className.toLowerCase() + "." + methodName + "()\n"); + System.out.println(" return self." + className.toLowerCase() + "." + methodName + "()"); } } @@ -81,7 +81,7 @@ private String resolveType(TypeMirror typeMirror) { return typeMirror.toString(); } - private void generateFieldAssignment(ExecutableElement methodElement, FileWriter fileWriter) throws IOException { + private void generateFieldAssignment(ExecutableElement methodElement) { List parameters = methodElement.getParameters(); if (!parameters.isEmpty()) { @@ -95,7 +95,7 @@ private void generateFieldAssignment(ExecutableElement methodElement, FileWriter } parameterList.append(parameter.getSimpleName()); } - fileWriter.write(" self." + className.toLowerCase() + "." + methodName + "(" + parameterList + ")\n"); + System.out.println(" self." + className.toLowerCase() + "." 
+ methodName + "(" + parameterList + ")"); } } diff --git a/examples/febrl/GeneratedFebrlExample.py b/examples/febrl/GeneratedFebrlExample.py deleted file mode 100644 index 54c64e77e..000000000 --- a/examples/febrl/GeneratedFebrlExample.py +++ /dev/null @@ -1,46 +0,0 @@ -from zingg.ArgumentsGenerated import * -from zingg.FieldDefinitionGenerated import * -from zingg.PipeGenerated import * -from zingg.otherThanGenerated import * -from zingg.otherThanGeneratedPipe import * -from zingg.otherThanGeneratedArguments import * -from zingg.otherThanGeneratedFieldDefinition import * - -#build the arguments for zingg -args = ExtendedArgumentsGenerated() -#set field definitions -fname = ExtendedFieldDefinitionGenerated("fname", "string", MatchType.FUZZY) -lname = ExtendedFieldDefinitionGenerated("lname", "string", MatchType.FUZZY) -stNo = ExtendedFieldDefinitionGenerated("stNo", "string", MatchType.FUZZY) -add1 = ExtendedFieldDefinitionGenerated("add1","string", MatchType.FUZZY) -add2 = ExtendedFieldDefinitionGenerated("add2", "string", MatchType.FUZZY) -city = ExtendedFieldDefinitionGenerated("city", "string", MatchType.FUZZY) -areacode = ExtendedFieldDefinitionGenerated("areacode", "string", MatchType.FUZZY) -state = ExtendedFieldDefinitionGenerated("state", "string", MatchType.FUZZY) -dob = ExtendedFieldDefinitionGenerated("dob", "string", MatchType.FUZZY) -ssn = ExtendedFieldDefinitionGenerated("ssn", "string", MatchType.FUZZY) - -fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn] - -args.setFieldDefinition(fieldDefs) -#set the modelid and the zingg dir -args.setModelId("0102") -args.setZinggDir("models") -args.setNumPartitions(4) -args.setLabelDataSampleSize(0.5) - -#reading dataset into inputPipe and settint it up in 'args' -#below line should not be required if you are reading from in memory dataset -#in that case, replace df with input df -schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" -inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema) -args.setData(inputPipe) -outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput") - -args.setOutput(outputPipe) - -options = ClientOptions([ClientOptions.PHASE,"findTrainingData"]) - -#Zingg execution for the given phase -zingg = Zingg(args, options) -zingg.initAndExecute() \ No newline at end of file diff --git a/python/zingg/ArgumentsGenerated.py b/python/zingg/ArgumentsGenerated.py deleted file mode 100644 index e03fc961c..000000000 --- a/python/zingg/ArgumentsGenerated.py +++ /dev/null @@ -1,44 +0,0 @@ -from zingg.otherThanGenerated import * -class Arguments: - def __init__(self): - self.arguments = getJVM().zingg.common.client.Arguments() - - def setNumPartitions(self, numPartitions): - self.arguments.setNumPartitions(numPartitions) - - def setLabelDataSampleSize(self, labelDataSampleSize): - self.arguments.setLabelDataSampleSize(labelDataSampleSize) - - def getModelId(self): - return self.arguments.getModelId() - - def setModelId(self, modelId): - self.arguments.setModelId(modelId) - - def setOutput(self, outputDir): - self.arguments.setOutput(outputDir) - - def setZinggDir(self, zinggDir): - self.arguments.setZinggDir(zinggDir) - - def getZinggBaseModelDir(self): - return self.arguments.getZinggBaseModelDir() - - def getZinggModelDir(self): - return self.arguments.getZinggModelDir() - - def getZinggBaseTrainingDataDir(self): - return self.arguments.getZinggBaseTrainingDataDir() - - def 
getZinggTrainingDataUnmarkedDir(self): - return self.arguments.getZinggTrainingDataUnmarkedDir() - - def getZinggTrainingDataMarkedDir(self): - return self.arguments.getZinggTrainingDataMarkedDir() - - def setStopWordsCutoff(self, stopWordsCutoff): - self.arguments.setStopWordsCutoff(stopWordsCutoff) - - def setColumn(self, column): - self.arguments.setColumn(column) - diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py deleted file mode 100644 index 4713552bd..000000000 --- a/python/zingg/FieldDefinitionGenerated.py +++ /dev/null @@ -1,24 +0,0 @@ -from zingg.otherThanGenerated import * -class FieldDefinition: - def __init__(self, name, dataType, *matchType): - self.fielddefinition = getJVM().zingg.common.client.FieldDefinition() - self.fielddefinition.setFieldName(name) - self.fielddefinition.setDataType(self.stringify(dataType)) - self.fielddefinition.setMatchType(matchType) - self.fielddefinition.setFields(name) - - def getFieldDefinition(self): - return self.fielddefinition - - def setFields(self, fields): - self.fielddefinition.setFields(fields) - - def setMatchType(self, type): - self.fielddefinition.setMatchType(type) - - def setStopWords(self, stopWords): - self.fielddefinition.setStopWords(stopWords) - - def setFieldName(self, fieldName): - self.fielddefinition.setFieldName(fieldName) - diff --git a/python/zingg/PipeGenerated.py b/python/zingg/PipeGenerated.py deleted file mode 100644 index 6144a7177..000000000 --- a/python/zingg/PipeGenerated.py +++ /dev/null @@ -1,35 +0,0 @@ -from zingg.otherThanGenerated import * -import logging -LOG = logging.getLogger("zingg.pipes") - -JPipe = getJVM().zingg.spark.client.pipe.SparkPipe -FilePipe = getJVM().zingg.common.client.pipe.FilePipe -JStructType = getJVM().org.apache.spark.sql.types.StructType - -class Pipe: - def __init__(self, name, format): - self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe() - self.pipe.setName(name) - self.pipe.setFormat(format) - - def setSchema(self, schema): - self.pipe.setSchema(schema) - - def getName(self): - return self.pipe.getName() - - def setName(self, name): - self.pipe.setName(name) - - def getFormat(self): - return self.pipe.getFormat() - - def setFormat(self, sinkType): - self.pipe.setFormat(sinkType) - - def setProp(self, k, v): - self.pipe.setProp(k, v) - - def toString(self): - return self.pipe.toString() - diff --git a/python/zingg/otherThanGenerated.py b/python/zingg/otherThanGenerated.py deleted file mode 100644 index af1f1c710..000000000 --- a/python/zingg/otherThanGenerated.py +++ /dev/null @@ -1,510 +0,0 @@ -""" -zingg.client ------------------------- -This module is the main entry point of the Zingg Python API -""" - -import logging -import argparse -import pandas as pd -from pyspark.sql import DataFrame - -from pyspark import SparkConf, SparkContext, SQLContext - -from py4j.java_collections import SetConverter, MapConverter, ListConverter - -from pyspark.sql import SparkSession -import os - -LOG = logging.getLogger("zingg") - -_spark_ctxt = None -_sqlContext = None -_spark = None -_zingg_jar = 'zingg-0.4.1-SNAPSHOT.jar' - -def initSparkClient(): - global _spark_ctxt - global _sqlContext - global _spark - _spark_ctxt = SparkContext.getOrCreate() - _sqlContext = SQLContext(_spark_ctxt) - _spark = SparkSession.builder.getOrCreate() - return 1 - -def initDataBricksConectClient(): - global _spark_ctxt - global _sqlContext - global _spark - jar_path = os.getenv('ZINGG_HOME')+'/'+_zingg_jar - _spark = SparkSession.builder.config('spark.jars', 
jar_path).getOrCreate() - _spark_ctxt = _spark.sparkContext - _sqlContext = SQLContext(_spark_ctxt) - return 1 - -def initClient(): - global _spark_ctxt - global _sqlContext - global _spark - if _spark_ctxt is None: - DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') - if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': - return initDataBricksConectClient() - else: - return initSparkClient() - else: - return 1 - -def getSparkContext(): - if _spark_ctxt is None: - initClient() - return _spark_ctxt - -def getSparkSession(): - if _spark is None: - initClient() - return _spark - -def getSqlContext(): - if _sqlContext is None: - initClient() - return _sqlContext - -def getJVM(): - return getSparkContext()._jvm - -def getGateway(): - return getSparkContext()._gateway - -ColName = getJVM().zingg.common.client.util.ColName -MatchType = getJVM().zingg.common.client.MatchType -ClientOptions = getJVM().zingg.common.client.ClientOptions -ZinggOptions = getJVM().zingg.common.client.ZinggOptions -LabelMatchType = getJVM().zingg.common.core.util.LabelMatchType -UpdateLabelMode = 'Overwrite' - -def getDfFromDs(data): - """ Method to convert spark dataset to dataframe - - :param data: provide spark dataset - :type data: DataSet - :return: converted spark dataframe - :rtype: DataFrame - """ - return DataFrame(data.df(), getSqlContext()) - -def getPandasDfFromDs(data): - """ Method to convert spark dataset to pandas dataframe - - :param data: provide spark dataset - :type data: DataSet - :return: converted pandas dataframe - :rtype: DataFrame - """ - df = getDfFromDs(data) - return pd.DataFrame(df.collect(), columns=df.columns) - -class Zingg: - """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. 
- - :param args: arguments for training and matching - :type args: Arguments - :param options: client option for this class object - :type options: ClientOptions - - """ - - def __init__(self, args, options): - self.inpArgs = args - self.inpOptions = options - self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions()) - - def init(self): - """ Method to initialize zingg client by reading internal configurations and functions """ - self.client.init() - - def execute(self): - """ Method to execute this class object """ - self.client.execute() - - def initAndExecute(self): - """ Method to run both init and execute methods consecutively """ - self.client.init() - DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') - if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': - options = self.client.getOptions() - inpPhase = options.get(ClientOptions.PHASE).getValue() - if (inpPhase==ZinggOptions.LABEL.getValue()): - self.executeLabel() - elif (inpPhase==ZinggOptions.UPDATE_LABEL.getValue()): - self.executeLabelUpdate() - else: - self.client.execute() - else: - self.client.execute() - - def executeLabel(self): - """ Method to run label phase """ - self.client.getTrainingDataModel().setMarkedRecordsStat(self.getMarkedRecords()) - unmarkedRecords = self.getUnmarkedRecords() - updatedRecords = self.processRecordsCli(unmarkedRecords,self.inpArgs) - self.writeLabelledOutput(updatedRecords,self.inpArgs) - - def executeLabelUpdate(self): - """ Method to run label update phase """ - self.processRecordsCliLabelUpdate(self.getMarkedRecords(),self.inpArgs) - - def getMarkedRecords(self): - """ Method to get marked record dataset from the inputpipe - - :return: spark dataset containing marked records - :rtype: Dataset - """ - return self.client.getMarkedRecords() - - def getUnmarkedRecords(self): - """ Method to get unmarked record dataset from the inputpipe - - :return: spark dataset containing unmarked records - :rtype: Dataset - """ - return self.client.getUnmarkedRecords() - - def processRecordsCli(self,unmarkedRecords,args): - """ Method to get user input on unmarked records - - :return: spark dataset containing updated records - :rtype: Dataset - """ - trainingDataModel = self.client.getTrainingDataModel() - labelDataViewHelper = self.client.getLabelDataViewHelper() - - if unmarkedRecords is not None and unmarkedRecords.count() > 0: - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - unmarkedRecords = unmarkedRecords.cache() - displayCols = labelDataViewHelper.getDisplayColumns(unmarkedRecords, args.getArgs()) - clusterIdZFrame = labelDataViewHelper.getClusterIdsFrame(unmarkedRecords) - clusterIDs = labelDataViewHelper.getClusterIds(clusterIdZFrame) - totalPairs = clusterIDs.size() - updatedRecords = None - for index in range(totalPairs): - currentPair = labelDataViewHelper.getCurrentPair(unmarkedRecords, index, clusterIDs, clusterIdZFrame) - - score = labelDataViewHelper.getScore(currentPair) - prediction = labelDataViewHelper.getPrediction(currentPair) - - msg1 = labelDataViewHelper.getMsg1(index, totalPairs) - msg2 = labelDataViewHelper.getMsg2(prediction, score) - labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), msg1, msg2) - selected_option = input() - while int(selected_option) not in [0,1,2,9]: - print('Please enter valid option') - selected_option = 
input("Enter choice: ") - if int(selected_option) == 9: - print("User has quit in the middle. Updating the records.") - break - trainingDataModel.updateLabellerStat(int(selected_option), 1) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - updatedRecords = trainingDataModel.updateRecords(int(selected_option), currentPair, updatedRecords) - print("Processing finished.") - return updatedRecords - else: - print("It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler.") - return None - - def processRecordsCliLabelUpdate(self,lines,args): - trainingDataModel = self.client.getTrainingDataModel() - labelDataViewHelper = self.client.getLabelDataViewHelper() - if (lines is not None and lines.count() > 0): - trainingDataModel.setMarkedRecordsStat(lines) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - displayCols = labelDataViewHelper.getDSUtil().getFieldDefColumns(lines, args.getArgs(), False, args.getArgs().getShowConcise()) - updatedRecords = None - recordsToUpdate = lines - selectedOption = -1 - - while (str(selectedOption) != '9'): - cluster_id = input("\n\tPlease enter the cluster id (or 9 to exit): ") - if str(cluster_id) == '9': - print("User has exit in the middle. Updating the records.") - break - currentPair = lines.filter(lines.equalTo(ColName.CLUSTER_COLUMN, cluster_id)) - if currentPair.isEmpty(): - print("\tInvalid cluster id. Enter '9' to exit") - continue - - matchFlag = currentPair.getAsInt(currentPair.head(),ColName.MATCH_FLAG_COL) - preMsg = "\n\tThe record pairs belonging to the input cluster id "+cluster_id+" are:" - postMsg = "\tThe above pair is labeled as "+str(matchFlag)+"\n" - labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), preMsg, postMsg) - selectedOption = input() - trainingDataModel.updateLabellerStat(int(selectedOption), 1) - trainingDataModel.updateLabellerStat(matchFlag, -1) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - - if (str(selectedOption) == '9'): - print("User has quit in the middle. Updating the records.") - break - - recordsToUpdate = recordsToUpdate.filter(recordsToUpdate.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) - - if (updatedRecords is not None): - updatedRecords = updatedRecords.filter(updatedRecords.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) - - updatedRecords = trainingDataModel.updateRecords(int(selectedOption), currentPair, updatedRecords) - - if updatedRecords is not None: - updatedRecords = updatedRecords.union(recordsToUpdate) - - outPipe = trainingDataModel.getOutputPipe(args.getArgs()) - outPipe.setMode(UpdateLabelMode) - - trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs(),outPipe) - print("Processing finished.") - return updatedRecords - else: - print("There is no marked record for updating. 
Please run findTrainingData/label jobs to generate training data.") - return None - - - def writeLabelledOutput(self,updatedRecords,args): - """ Method to write updated records after user input - """ - trainingDataModel = self.client.getTrainingDataModel() - if updatedRecords is not None: - trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs()) - - def writeLabelledOutputFromPandas(self,candidate_pairs_pd,args): - """ Method to write updated records (as pandas df) after user input - """ - markedRecordsAsDS = (getSparkSession().createDataFrame(candidate_pairs_pd))._jdf - # pandas df gives z_isMatch as long so it needs to be cast - markedRecordsAsDS = markedRecordsAsDS.withColumn(ColName.MATCH_FLAG_COL,markedRecordsAsDS.col(ColName.MATCH_FLAG_COL).cast("int")) - updatedRecords = getJVM().zingg.spark.client.SparkFrame(markedRecordsAsDS) - self.writeLabelledOutput(updatedRecords,args) - - def setArguments(self, args): - """ Method to set Arguments - - :param args: provide arguments for this class object - :type args: Arguments - """ - self.client.setArguments(args) - - def getArguments(self): - """ Method to get arguments of this class object - - :return: The pointer containing the address of the Arguments object of this class object - :rtype: pointer(Arguments) - """ - return self.client.getArguments() - - def getOptions(self): - """ Method to get client options of this class object - - :return: The pointer containing the address of the ClientOptions object of this class object - :rtype: pointer(ClientOptions) - """ - return self.client.getOptions() - - def setOptions(self, options): - """ Method to set client options of this class object - - :param options: provide client options for this class object - :type options: ClientOptions - :return: The pointer containing the address of the ClientOptions object of this class object - :rtype: pointer(ClientOptions) - """ - return self.client.setOptions(options) - - def getMarkedRecordsStat(self, markedRecords, value): - """ Method to get the no. of records that are marked - - :param markedRecords: spark dataset containing marked records - :type markedRecords: Dataset - :param value: flag value to check if markedRecord is initially matched or not - :type value: long - :return: The no. of marked records - :rtype: int - """ - return self.client.getMarkedRecordsStat(markedRecords, value) - - def getMatchedMarkedRecordsStat(self): - """ Method to get the no. of records that are marked and matched - - :return: The no. of matched marked records - :rtype: int - """ - return self.client.getMatchedMarkedRecordsStat(self.getMarkedRecords()) - - def getUnmatchedMarkedRecordsStat(self): - """ Method to get the no. of records that are marked and unmatched - - :return: The no. of unmatched marked records - :rtype: int - """ - return self.client.getUnmatchedMarkedRecordsStat(self.getMarkedRecords()) - - def getUnsureMarkedRecordsStat(self): - """ Method to get the no. of records that are marked as Not Sure, i.e. it is uncertain whether they are matched or not - - :return: The no. of Not Sure marked records - :rtype: int - """ - return self.client.getUnsureMarkedRecordsStat(self.getMarkedRecords()) - - - -class ZinggWithSpark(Zingg): - - """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local.
- - :param args: arguments for training and matching - :type args: Arguments - :param options: client option for this class object - :type options: ClientOptions - - """ - - def __init__(self, args, options): - self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions(), getSparkSession()._jsparkSession) - -class ClientOptions: - """ Class that contains Client options for Zingg object - :param phase: trainMatch, train, match, link, findAndLabel, findTrainingData, recommend etc - :type phase: String - :param args: a list of Zingg command line option values, e.g. "--location" etc.; optional argument for initializing this class. - :type args: List(String) or None - """ - PHASE = getJVM().zingg.common.client.ClientOptions.PHASE - """:PHASE: phase parameter for this class""" - CONF = getJVM().zingg.common.client.ClientOptions.CONF - """:CONF: conf parameter for this class""" - LICENSE = getJVM().zingg.common.client.ClientOptions.LICENSE - """:LICENSE: license parameter for this class""" - EMAIL = getJVM().zingg.common.client.ClientOptions.EMAIL - """:EMAIL: e-mail parameter for this class""" - LOCATION = getJVM().zingg.common.client.ClientOptions.LOCATION - """:LOCATION: location parameter for this class""" - REMOTE = getJVM().zingg.common.client.ClientOptions.REMOTE - """:REMOTE: remote option used internally for running on Databricks""" - ZINGG_DIR = getJVM().zingg.common.client.ClientOptions.ZINGG_DIR - """:ZINGG_DIR: location where Zingg saves the model, training data etc""" - MODEL_ID = getJVM().zingg.common.client.ClientOptions.MODEL_ID - """:MODEL_ID: ZINGG_DIR/MODEL_ID is used to save the model""" - COLUMN = getJVM().zingg.common.client.ClientOptions.COLUMN - """:COLUMN: Column whose stop words are to be recommended through Zingg""" - - def __init__(self, argsSent=None): - print(argsSent) - if(argsSent == None): - args = [] - else: - args = argsSent.copy() - if (not (self.PHASE in args)): - args.append(self.PHASE) - args.append("peekModel") - if (not (self.LICENSE in args)): - args.append(self.LICENSE) - args.append("zinggLic.txt") - if (not (self.EMAIL in args)): - args.append(self.EMAIL) - args.append("zingg@zingg.ai") - if (not (self.CONF in args)): - args.append(self.CONF) - args.append("dummyConf.json") - print("arguments for client options are ", args) - self.co = getJVM().zingg.common.client.ClientOptions(args) - - - def getClientOptions(self): - """ Method to get pointer address of this class - - :return: The pointer containing the address of this class object - :rtype: pointer(ClientOptions) - """ - return self.co - - def getOptionValue(self, option): - """ Method to get value for the key option - - :param option: key for getting the value - :type option: String - :return: The value which is mapped for given key - :rtype: String - """ - return self.co.getOptionValue(option) - - def setOptionValue(self, option, value): - """ Method to map option key to the given value - - :param option: key that is mapped with value - :type option: String - :param value: value to be set for given key - :type value: String - """ - self.co.get(option).setValue(value) - - def getPhase(self): - """ Method to get PHASE value - - :return: The PHASE parameter value - :rtype: String - """ - return self.co.get(ClientOptions.PHASE).getValue() - - def setPhase(self, newValue): - """ Method to set PHASE value - - :param newValue: name of the phase - :type newValue: String - :return: The pointer containing the address of this class object after setting the phase -
:rtype: pointer(ClientOptions) - """ - self.co.get(ClientOptions.PHASE).setValue(newValue) - - def getConf(self): - """ Method to get CONF value - - :return: The CONF parameter value - :rtype: String - """ - return self.co.get(ClientOptions.CONF).getValue() - - def hasLocation(self): - """ Method to check whether this class has the LOCATION parameter set or not - - :return: True if the LOCATION parameter is present, False otherwise - :rtype: Bool - """ - if(self.co.get(ClientOptions.LOCATION)==None): - return False - else: - return True - - def getLocation(self): - """ Method to get LOCATION value - - :return: The LOCATION parameter value - :rtype: String - """ - return self.co.get(ClientOptions.LOCATION).getValue() - -def parseArguments(argv): - """ This method is used for checking mandatory arguments and creating an arguments list from command line arguments - - :param argv: Values that are passed during the calling of the program along with the calling statement. - :type argv: List - :return: a list containing necessary arguments to run any phase - :rtype: List - """ - parser = argparse.ArgumentParser(description='Zingg\'s python APIs') - mandatoryOptions = parser.add_argument_group('mandatory arguments') - mandatoryOptions.add_argument('--phase', required=True, - help='python phase e.g. assessModel') - mandatoryOptions.add_argument('--conf', required=True, - help='JSON configuration with data input output locations and field definitions') - - args, remaining_args = parser.parse_known_args(argv) - LOG.debug("args: %s", args) - return args \ No newline at end of file diff --git a/python/zingg/otherThanGeneratedArguments.py b/python/zingg/otherThanGeneratedArguments.py deleted file mode 100644 index 113d08ead..000000000 --- a/python/zingg/otherThanGeneratedArguments.py +++ /dev/null @@ -1,56 +0,0 @@ -from zingg.ArgumentsGenerated import * -from zingg.otherThanGeneratedFieldDefinition import * - -class ExtendedArgumentsGenerated(Arguments): - def __init__(self): - super().__init__() - - def setFieldDefinition(self, fieldDef): - javaFieldDef = [] - for f in fieldDef: - javaFieldDef.append(f.getFieldDefinition()) - self.arguments.setFieldDefinition(javaFieldDef) - - def setData(self, *pipes): - dataPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) - for idx, pipe in enumerate(pipes): - dataPipe[idx] = pipe.getPipe() - self.arguments.setData(dataPipe) - - def setOutput(self, *pipes): - outputPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) - for idx, pipe in enumerate(pipes): - outputPipe[idx] = pipe.getPipe() - self.arguments.setOutput(outputPipe) - - def getArgs(self): - return self.arguments - - def setTrainingSamples(self, *pipes): - dataPipe = getGateway().new_array(getJVM().zingg.common.client.pipe.Pipe, len(pipes)) - for idx, pipe in enumerate(pipes): - dataPipe[idx] = pipe.getPipe() - self.arguments.setTrainingSamples(dataPipe) - - def writeArgumentsToJSON(self, fileName): - getJVM().zingg.common.client.ArgumentsUtil().writeArgumentsToJSON(fileName, self.arguments) - - @staticmethod - def createArgumentsFromJSON(fileName, phase): - obj = Arguments() - obj.arguments = getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSON(fileName, phase) - return obj - - def writeArgumentsToJSONString(self): - return getJVM().zingg.common.client.ArgumentsUtil().writeArgumentstoJSONString(self.arguments) - - @staticmethod - def createArgumentsFromJSONString(jsonArgs, phase): - obj = Arguments() - obj.arguments =
getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSONString(jsonArgs, phase) - return obj - - def copyArgs(self, phase): - argsString = self.writeArgumentsToJSONString() - return self.createArgumentsFromJSONString(argsString, phase) - \ No newline at end of file diff --git a/python/zingg/otherThanGeneratedFieldDefinition.py b/python/zingg/otherThanGeneratedFieldDefinition.py deleted file mode 100644 index 43f3d229e..000000000 --- a/python/zingg/otherThanGeneratedFieldDefinition.py +++ /dev/null @@ -1,20 +0,0 @@ -from zingg.FieldDefinitionGenerated import * - -class ExtendedFieldDefinitionGenerated(FieldDefinition): - def __init__(self, name, dataType, *matchType): - super().__init__(name, dataType, *matchType) - - def getFieldDefinition(self): - return self.fielddefinition - - # should be stringified before it is set in the fd object - def stringify(self, str): - """ Method to stringify the dataType before it is set in the FieldDefinition object - - :param str: dataType of the FieldDefinition - :type str: String - :return: The stringified value of the dataType - :rtype: String - """ - - return str \ No newline at end of file diff --git a/python/zingg/otherThanGeneratedPipe.py b/python/zingg/otherThanGeneratedPipe.py deleted file mode 100644 index a46df2794..000000000 --- a/python/zingg/otherThanGeneratedPipe.py +++ /dev/null @@ -1,228 +0,0 @@ -from zingg.PipeGenerated import * - -class ExtendedPipeGenerated(Pipe): - def __init__(self, name, format): - super().__init__(name, format) - - def getPipe(self): - return self.pipe - - def addProperty(self, name, value): - """ Method for adding different properties of pipe - - :param name: name of the property - :type name: String - :param value: value you want to set for the property - :type value: String - """ - self.pipe.setProp(name, value) - -class CsvPipe(ExtendedPipeGenerated): - """ Class CsvPipe: used for working with delimited text files, where a separator character splits each line into the values that belong in different columns. - - :param name: name of the pipe.
- :type name: String - :param location: (optional) location from where we read data - :type location: String or None - :param schema: (optional) json schema for the pipe - :type schema: Schema or None - """ - def __init__(self, name, location = None, schema = None): - ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_CSV) - if(location != None): - ExtendedPipeGenerated.addProperty(self, FilePipe.LOCATION, location) - if(schema != None): - #df = spark.read.format(JPipe.FORMAT_CSV).schema(schema).load(location) - #s = JStructType.fromDDL(schema) - ExtendedPipeGenerated.setSchema(self, schema) - print("set schema ") - - def setDelimiter(self, delimiter): - """ This method is used to define the delimiter of the CsvPipe - - :param delimiter: a sequence of one or more characters for specifying the boundary between separate, independent regions in data streams - :type delimiter: String - """ - ExtendedPipeGenerated.addProperty(self, "delimiter", delimiter) - - - def setLocation(self, location): - """ Method to set location of pipe - - :param location: location from where we read data - :type location: String - """ - ExtendedPipeGenerated.addProperty(self, FilePipe.LOCATION, location) - - def setHeader(self, header): - """ Method to set header property of pipe - - :param header: true if the pipe has a header, false otherwise - :type header: Boolean - """ - ExtendedPipeGenerated.addProperty(self, FilePipe.HEADER, header) - -class BigQueryPipe(ExtendedPipeGenerated): - """ Pipe Class for working with BigQuery pipeline - - :param name: name of the pipe. - :type name: String - """ - - VIEWS_ENABLED = "viewsEnabled" - CREDENTIAL_FILE = "credentialsFile" - TABLE = "table" - TEMP_GCS_BUCKET="temporaryGcsBucket" - - def __init__(self,name): - ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_BIGQUERY) - - def setCredentialFile(self, file): - """ Method to set Credential file to the pipe - - :param file: credential file name - :type file: String - """ - ExtendedPipeGenerated.addProperty(self, "credentialsFile", file) - - def setTable(self, table): - """ Method to set Table to the pipe - - :param table: provide table parameter - :type table: String - """ - ExtendedPipeGenerated.addProperty(self, "table", table) - - def setTemporaryGcsBucket(self, bucket): - """ Method to set TemporaryGcsBucket to the pipe - - :param bucket: provide bucket parameter - :type bucket: String - """ - ExtendedPipeGenerated.addProperty(self, "temporaryGcsBucket", bucket) - - def setViewsEnabled(self, isEnabled): - """ Method to set whether the viewsEnabled parameter is enabled or not - - :param isEnabled: provide boolean parameter which defines if the viewsEnabled option is enabled or not - :type isEnabled: Bool - """ - ExtendedPipeGenerated.addProperty(self, "viewsEnabled", isEnabled) - - -class SnowflakePipe(ExtendedPipeGenerated): - """ Pipe Class for working with Snowflake pipeline - - :param name: name of the pipe - :type name: String - """ - URL = "sfUrl" - USER = "sfUser" - PASSWORD = "sfPassword" - DATABASE ="sfDatabase" - SCHEMA = "sfSchema" - WAREHOUSE = "sfWarehouse" - DBTABLE = "dbtable" - - def __init__(self,name): - ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_SNOWFLAKE) - ExtendedPipeGenerated.addProperty(self, "application", "zinggai_zingg") - - - def setURL(self, url): - """ Method to set url to the pipe - - :param url: provide url for this pipe - :type url: String - """ - ExtendedPipeGenerated.addProperty(self, "sfUrl", url) - - def setUser(self, user): - """ Method to set User to the pipe - - :param user: provide
User parameter. - :type user: String - """ - ExtendedPipeGenerated.addProperty(self, "sfUser", user) - - def setPassword(self, passwd): - """ Method to set Password to the pipe - - :param passwd: provide Password parameter. - :type passwd: String - """ - ExtendedPipeGenerated.addProperty(self, "sfPassword", passwd) - - def setDatabase(self, db): - """ Method to set Database to the pipe - - :param db: provide Database parameter. - :type db: String - """ - ExtendedPipeGenerated.addProperty(self, "sfDatabase", db) - - def setSFSchema(self, schema): - """ Method to set Schema to the pipe - - :param schema: provide schema parameter. - :type schema: String - """ - ExtendedPipeGenerated.addProperty(self, "sfSchema", schema) - - def setWarehouse(self, warehouse): - """ Method to set warehouse parameter to the pipe - - :param warehouse: provide warehouse parameter. - :type warehouse: String - """ - ExtendedPipeGenerated.addProperty(self, "sfWarehouse", warehouse) - - def setDbTable(self, dbtable): - """ Method to set dbtable to the pipe - - :param dbtable: provide dbtable parameter. - :type dbtable: String - """ - ExtendedPipeGenerated.addProperty(self, "dbtable", dbtable) - - -class InMemoryPipe(ExtendedPipeGenerated): - """ Pipe Class for working with InMemory pipeline - - :param name: name of the pipe - :type name: String - :param df: provide dataset for this pipe (optional) - :type df: Dataset or None - """ - - def __init__(self, name, df = None): - ExtendedPipeGenerated.__init__(self, name, JPipe.FORMAT_INMEMORY) - if (df is not None): - self.setDataset(df) - - def setDataset(self, df): - """ Method to set DataFrame of the pipe - - :param df: pandas or spark dataframe for the pipe - :type df: DataFrame - """ - if (isinstance(df, pd.DataFrame)): - print('schema of pandas df is ' , ExtendedPipeGenerated.getPipe(self).getSchema()) - if (ExtendedPipeGenerated.getPipe(self).getSchema() is not None): - ds = getSparkSession().createDataFrame(df, schema=ExtendedPipeGenerated.getPipe(self).getSchema()) - else: - ds = getSparkSession().createDataFrame(df) - - ExtendedPipeGenerated.getPipe(self).setDataset(ds._jdf) - elif (isinstance(df, DataFrame)): - ExtendedPipeGenerated.getPipe(self).setDataset(df._jdf) - else: - LOG.error(" setDataset(): Null or unsupported type: %s", type(df)) - - def getDataset(self): - """ Method to get Dataset from pipe - - :return: dataset of the pipe in the format of spark dataset - :rtype: Dataset - """ - return ExtendedPipeGenerated.getPipe(self).getDataset().df() \ No newline at end of file From 259fb60eaea01f04bc9f5160d907d449b55b0970 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 13:35:30 +0530 Subject: [PATCH 099/219] Revert "Update Annotations to generate code" This reverts commit 62e2933baba7f10cfdad1639313fcb18ea35607c.
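For a condensed sense of how the Python API deleted above was meant to be used, here is a usage sketch adapted from the removed GeneratedFebrlExample.py; the model id, file paths and single field definition are illustrative only:

    from zingg.otherThanGenerated import *
    from zingg.otherThanGeneratedPipe import *
    from zingg.otherThanGeneratedArguments import *
    from zingg.otherThanGeneratedFieldDefinition import *

    # build arguments: one fuzzy field, a CSV source and a CSV sink
    args = ExtendedArgumentsGenerated()
    fname = ExtendedFieldDefinitionGenerated("fname", "string", MatchType.FUZZY)
    args.setFieldDefinition([fname])
    args.setModelId("0102")
    args.setZinggDir("models")
    args.setData(CsvPipe("testFebrl", "examples/febrl/test.csv"))
    args.setOutput(CsvPipe("resultFebrl", "/tmp/febrlOutput"))

    # run a single phase end to end over py4j
    zingg = Zingg(args, ClientOptions([ClientOptions.PHASE, "findTrainingData"]))
    zingg.initAndExecute()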
--- .../py/processors/PythonClassProcessor.java | 31 ++++++++++--------- .../py/processors/PythonMethodProcessor.java | 14 ++++----- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 0efd7bf25..45af2e2e5 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -53,14 +53,14 @@ public boolean process(Set annotations, RoundEnvironment // __init__ method System.out.println(" def __init__(self" + generateConstructorParameters(classElement) + "):"); - generateClassInitializationCode(classElement, element); - - // for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - // if (!field.getSimpleName().contentEquals("serialVersionUID")) { - // generateFieldInitializationCode(field, element); - // } - // } - + if (element.getSimpleName().contentEquals("Pipe")) { + generateClassInitializationCode(classElement, element); + } + for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + if (!field.getSimpleName().contentEquals("serialVersionUID")) { + generateFieldInitializationCode(field, element); + } + } for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { if (methodElement.getAnnotation(PythonMethod.class) != null) { methodNames.add(methodElement.getSimpleName().toString()); @@ -140,16 +140,17 @@ else if (element.getSimpleName().contentEquals("FieldDefinition")) { fileWriter.write(" return self.fielddefinition\n"); } fileWriter.write("\n"); + System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); } - // private void generateFieldInitializationCode(VariableElement field, Element element) { - // String fieldName = field.getSimpleName().toString(); - // String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." + fieldName + " = " + fieldName; + private void generateFieldInitializationCode(VariableElement field, Element element) { + String fieldName = field.getSimpleName().toString(); + String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." 
+ fieldName + " = " + fieldName; - // if (!fieldName.startsWith("FORMAT_")) { - // System.out.println(" " + fieldAssignment); - // } - // } + if (!fieldName.startsWith("FORMAT_")) { + System.out.println(" " + fieldAssignment); + } + } private String generateConstructorParameters(TypeElement classElement) { StringBuilder parameters = new StringBuilder(); diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 183b6458d..5a499795f 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -7,7 +7,7 @@ import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; -// import java.util.logging.Logger; +import java.util.logging.Logger; import javax.lang.model.element.*; import zingg.common.py.annotations.*; @@ -38,8 +38,7 @@ public boolean process(Set annotations, RoundEnvironment if (methodNames.contains(methodElement.getSimpleName().toString())) { // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):"); - generateMethodReturn(methodElement); + generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); generateFieldAssignment(methodElement); } } @@ -65,15 +64,14 @@ private String generateMethodParameters(ExecutableElement methodElement) { return parameters.toString(); } - private void generateMethodReturn(ExecutableElement methodElement) { + private String generateMethodReturn(ExecutableElement methodElement) { TypeMirror returnType = methodElement.getReturnType(); if (returnType.getKind() == TypeKind.VOID) { - return; + return ""; } else { String returnTypeString = resolveType(returnType); - String methodName = methodElement.getSimpleName().toString(); - String className = methodElement.getEnclosingElement().getSimpleName().toString(); - System.out.println(" return self." + className.toLowerCase() + "." + methodName + "()"); + String variableName = methodElement.getSimpleName().toString(); + return "return " + variableName; } } From c5707779ab4c0d9dcc4929ecb9df808a705d9048 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 14:04:24 +0530 Subject: [PATCH 100/219] Revert "updated the setter functions in annotations" This reverts commit 17baef2ea8c9616bb0eeea510574ef2c30c25080. 
--- .../java/zingg/common/client/pipe/Pipe.java | 6 +- .../py/processors/ProcessorContext.java | 22 ------ .../py/processors/PythonClassProcessor.java | 72 ++++++------------- .../py/processors/PythonMethodProcessor.java | 56 +++++---------- 4 files changed, 37 insertions(+), 119 deletions(-) delete mode 100644 common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index aab0878b1..99e80c9cc 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -55,7 +55,7 @@ public String getSchema() { return schema; } - @PythonMethod + public void setSchema(String schema) { this.schema = schema; } @@ -71,12 +71,10 @@ public void setName(String name) { this.name = name; } - @PythonMethod public String getFormat() { return format; } - @PythonMethod @JsonValue public void setFormat(String sinkType) { this.format = sinkType; @@ -92,7 +90,6 @@ public Map getProps() { return props; } - @PythonMethod public void setProp(String k, String v) { if (props == null) props = new HashMap(); this.props.put(k, v); @@ -137,7 +134,6 @@ public void setDataset(ZFrame ds){ this.dataset = ds; } - @PythonMethod @Override public String toString() { StringRedactor redactor = new StringRedactor(); diff --git a/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java b/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java deleted file mode 100644 index 6131d51ed..000000000 --- a/common/py/src/main/java/zingg/common/py/processors/ProcessorContext.java +++ /dev/null @@ -1,22 +0,0 @@ -package zingg.common.py.processors; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class ProcessorContext { - private static final ProcessorContext INSTANCE = new ProcessorContext(); - - private Map> classMethodsMap = new HashMap<>(); - - private ProcessorContext() { - } - - public static ProcessorContext getInstance() { - return INSTANCE; - } - - public Map> getClassMethodsMap() { - return classMethodsMap; - } -} diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java index 45af2e2e5..76742e067 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java @@ -1,14 +1,10 @@ package zingg.common.py.processors; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; - import javax.annotation.processing.*; +import javax.lang.model.type.TypeMirror; +import javax.lang.model.type.TypeKind; import java.util.Set; -import java.util.stream.Collectors; - import javax.lang.model.element.*; import javax.lang.model.util.ElementFilter; @@ -18,12 +14,6 @@ public class PythonClassProcessor extends AbstractProcessor { private boolean importsAndDeclarationsGenerated = false; - private Map> classMethodsMap = new HashMap<>(); - - @Override - public synchronized void init(ProcessingEnvironment processingEnv) { - super.init(processingEnv); - } @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { @@ -48,37 +38,28 @@ public boolean process(Set annotations, RoundEnvironment try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + 
element.getSimpleName() + "Generated.py")) { generateImportsAndDeclarations(element, fileWriter); + PackageElement packageElement = + (PackageElement) classElement.getEnclosingElement(); System.out.println("class " + element.getSimpleName() + ":"); // __init__ method System.out.println(" def __init__(self" + generateConstructorParameters(classElement) + "):"); - if (element.getSimpleName().contentEquals("Pipe")) { - generateClassInitializationCode(classElement, element); + if (element.getSimpleName().contentEquals("pipe")) { + generateClassInitializationCode(classElement); } for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { if (!field.getSimpleName().contentEquals("serialVersionUID")) { - generateFieldInitializationCode(field, element); - } - } - for (ExecutableElement methodElement : ElementFilter.methodsIn(classElement.getEnclosedElements())) { - if (methodElement.getAnnotation(PythonMethod.class) != null) { - methodNames.add(methodElement.getSimpleName().toString()); + generateFieldInitializationCode(field); } } - classMethodsMap.put(element.getSimpleName().toString(), methodNames); } System.out.println(); - // rest of generated class contents + // rest of generated class contents } - ProcessorContext processorContext = ProcessorContext.getInstance(); - processorContext.getClassMethodsMap().putAll(classMethodsMap); return false; - } - Map> getClassMethodsMap() { - return classMethodsMap; } private String determineOutputDirectory(String packageName) { @@ -143,37 +124,24 @@ else if (element.getSimpleName().contentEquals("FieldDefinition")) { System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); } - private void generateFieldInitializationCode(VariableElement field, Element element) { - String fieldName = field.getSimpleName().toString(); - String fieldAssignment = "self." + element.getSimpleName().toString().toLowerCase() + "." + fieldName + " = " + fieldName; - - if (!fieldName.startsWith("FORMAT_")) { - System.out.println(" " + fieldAssignment); - } + // private void generateFieldInitializationCode(VariableElement field, ExecutableElement methodElement, TypeElement classElement) { + private void generateFieldInitializationCode(VariableElement field) { + System.out.println(" self.pipe." + field.getSimpleName() + " = " + field.getSimpleName()); + // String fieldName = field.getSimpleName().toString(); + // String methodName = methodElement.getSimpleName().toString(); + // System.out.println(" self." + fieldName + " = " + "getJVM()." + + // classElement.getQualifiedName().toString() + "." 
+ methodName + "(" + fieldName + ")"); } private String generateConstructorParameters(TypeElement classElement) { StringBuilder parameters = new StringBuilder(); - List fields = ElementFilter.fieldsIn(classElement.getEnclosedElements()); - - fields = fields.stream() - .filter(field -> !field.getSimpleName().contentEquals("serialVersionUID")) - .filter(this::isFieldForConstructor) - .collect(Collectors.toList()); - - for (VariableElement field : fields) { - parameters.append(", "); - parameters.append(field.getSimpleName()); + for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { + if (!field.getSimpleName().contentEquals("serialVersionUID")) { + parameters.append(", "); + parameters.append(field.getSimpleName()); + } } return parameters.toString(); } - - private boolean isFieldForConstructor(VariableElement field) { - String fieldName = field.getSimpleName().toString(); - - return !fieldName.equals(fieldName.toUpperCase()) - && !field.getModifiers().contains(Modifier.STATIC) - && !fieldName.startsWith("FORMAT_"); - } } diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java index 5a499795f..fe0b02747 100644 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java @@ -1,51 +1,36 @@ package zingg.common.py.processors; import java.util.List; -import java.util.Map; - import javax.annotation.processing.*; import javax.lang.model.type.TypeMirror; import javax.lang.model.type.TypeKind; import java.util.Set; -import java.util.logging.Logger; - import javax.lang.model.element.*; +import javax.lang.model.util.ElementFilter; + import zingg.common.py.annotations.*; @SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") public class PythonMethodProcessor extends AbstractProcessor { - private Map> classMethodsMap; - // private static final Logger LOG = Logger.getLogger(PythonMethodProcessor.class.getName()); - + private boolean importsAndDeclarationsGenerated = false; + @Override public boolean process(Set annotations, RoundEnvironment roundEnv) { - ProcessorContext processorContext = ProcessorContext.getInstance(); - classMethodsMap = processorContext.getClassMethodsMap(); - // LOG.info("Processing PythonMethod annotations..."); - // process Services annotation for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { if (element.getKind() == ElementKind.METHOD) { ExecutableElement methodElement = (ExecutableElement) element; - String className = methodElement.getEnclosingElement().getSimpleName().toString(); - - if (classMethodsMap.containsKey(className)) { - List methodNames = classMethodsMap.get(className); - - if (methodNames.contains(methodElement.getSimpleName().toString())) { - // LOG.info("Generating Python method for: " + methodElement.getSimpleName()); - System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); - generateFieldAssignment(methodElement); - } - } + System.out.println(" def " + methodElement.getSimpleName() + "(self" + + generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); + generateFieldAssignment(methodElement); } - System.out.println(); + System.out.println(); + + // rest of generated class contents } - // LOG.info("Processing complete."); 
return false; } @@ -80,21 +65,12 @@ private String resolveType(TypeMirror typeMirror) { } private void generateFieldAssignment(ExecutableElement methodElement) { - List parameters = methodElement.getParameters(); - - if (!parameters.isEmpty()) { - String methodName = methodElement.getSimpleName().toString(); - String className = methodElement.getEnclosingElement().getSimpleName().toString(); - - StringBuilder parameterList = new StringBuilder(); - for (VariableElement parameter : parameters) { - if (parameterList.length() > 0) { - parameterList.append(", "); - } - parameterList.append(parameter.getSimpleName()); - } - System.out.println(" self." + className.toLowerCase() + "." + methodName + "(" + parameterList + ")"); - } + List parameters = methodElement.getParameters(); + if (!parameters.isEmpty()) { + VariableElement parameter = parameters.get(0); + String variableName = parameter.getSimpleName().toString(); + System.out.println(" self." + variableName + " = " + variableName); } +} } From 0398394c953f90c2cfec7db86c17c00550731e5b Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 16:25:09 +0530 Subject: [PATCH 101/219] Revert "Annotations to Generate py codes" This reverts commit fa65b3e6a091cf2b8c6c73243d3dcbcb7acfad74. --- common/client/pom.xml | 28 ---- .../java/zingg/common/client/pipe/Pipe.java | 9 +- common/pom.xml | 1 - common/py/pom.xml | 10 -- .../common/py/annotations/PythonClass.java | 9 -- .../common/py/annotations/PythonMethod.java | 9 -- .../py/processors/PythonClassProcessor.java | 147 ------------------ .../py/processors/PythonMethodProcessor.java | 76 --------- 8 files changed, 3 insertions(+), 286 deletions(-) delete mode 100644 common/py/pom.xml delete mode 100644 common/py/src/main/java/zingg/common/py/annotations/PythonClass.java delete mode 100644 common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java delete mode 100644 common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java delete mode 100644 common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java diff --git a/common/client/pom.xml b/common/client/pom.xml index d8e3bafd3..c67339949 100644 --- a/common/client/pom.xml +++ b/common/client/pom.xml @@ -8,38 +8,10 @@ zingg-common-client jar - - zingg - zingg-common-py - ${zingg.version} - javax.mail mail 1.4 - - - - - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${maven.compiler.source} - ${maven.compiler.source} - true - - - - zingg.common.py.processors.PythonClassProcessor - - - zingg.common.py.processors.PythonMethodProcessor - - - - - - diff --git a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java index 99e80c9cc..7a4f8ff88 100644 --- a/common/client/src/main/java/zingg/common/client/pipe/Pipe.java +++ b/common/client/src/main/java/zingg/common/client/pipe/Pipe.java @@ -11,16 +11,13 @@ import zingg.common.client.ZFrame; import zingg.common.client.util.StringRedactor; -import zingg.common.py.annotations.PythonClass; -import zingg.common.py.annotations.PythonMethod; - /**Actual pipe def in the args. 
One pipe can be used at multiple places with different tables, locations, queries etc * * @author sgoyal * */ -@PythonClass + @JsonInclude(Include.NON_NULL) public class Pipe implements Serializable{ // St:StructType, Sv:SaveMode @@ -60,12 +57,12 @@ public void setSchema(String schema) { this.schema = schema; } - @PythonMethod + public String getName() { return name; } - @PythonMethod + @JsonValue public void setName(String name) { this.name = name; diff --git a/common/pom.xml b/common/pom.xml index c50c2b037..23dd19064 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -12,6 +12,5 @@ infra core client - py diff --git a/common/py/pom.xml b/common/py/pom.xml deleted file mode 100644 index bde63932b..000000000 --- a/common/py/pom.xml +++ /dev/null @@ -1,10 +0,0 @@ - - 4.0.0 - - zingg - zingg-common - ${zingg.version} - - zingg-common-py - \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java b/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java deleted file mode 100644 index 0d3bf21a5..000000000 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonClass.java +++ /dev/null @@ -1,9 +0,0 @@ -package zingg.common.py.annotations; - -import javax.annotation.processing.*; - -import java.lang.annotation.Target; -import java.lang.annotation.ElementType; - -@Target({ElementType.TYPE}) -public @interface PythonClass {} \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java b/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java deleted file mode 100644 index f59a9c038..000000000 --- a/common/py/src/main/java/zingg/common/py/annotations/PythonMethod.java +++ /dev/null @@ -1,9 +0,0 @@ -package zingg.common.py.annotations; - -import javax.annotation.processing.*; - -import java.lang.annotation.Target; -import java.lang.annotation.ElementType; - -@Target({ElementType.METHOD}) -public @interface PythonMethod {} \ No newline at end of file diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java deleted file mode 100644 index 76742e067..000000000 --- a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java +++ /dev/null @@ -1,147 +0,0 @@ -package zingg.common.py.processors; - -import java.util.List; -import javax.annotation.processing.*; -import javax.lang.model.type.TypeMirror; -import javax.lang.model.type.TypeKind; -import java.util.Set; -import javax.lang.model.element.*; -import javax.lang.model.util.ElementFilter; - -import zingg.common.py.annotations.*; - -@SupportedAnnotationTypes("zingg.common.py.annotations.PythonClass") -public class PythonClassProcessor extends AbstractProcessor { - - private boolean importsAndDeclarationsGenerated = false; - - @Override - public boolean process(Set annotations, RoundEnvironment roundEnv) { - - // Imports and global declarations - if (!importsAndDeclarationsGenerated) { - generateImportsAndDeclarations(); - importsAndDeclarationsGenerated = true; - } - - - // process Services annotation - for (Element element : roundEnv.getElementsAnnotatedWith(PythonClass.class)) { - if (element.getKind() == ElementKind.CLASS) { - TypeElement classElement = (TypeElement) element; - PackageElement packageElement = (PackageElement) classElement.getEnclosingElement(); - String packageName = packageElement.getQualifiedName().toString(); - List methodNames = new ArrayList<>(); - - String 
outputDirectory = determineOutputDirectory(packageName); - - try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) { - generateImportsAndDeclarations(element, fileWriter); - - PackageElement packageElement = - (PackageElement) classElement.getEnclosingElement(); - System.out.println("class " + element.getSimpleName() + ":"); - - // __init__ method - System.out.println(" def __init__(self" + - generateConstructorParameters(classElement) + "):"); - if (element.getSimpleName().contentEquals("pipe")) { - generateClassInitializationCode(classElement); - } - for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - if (!field.getSimpleName().contentEquals("serialVersionUID")) { - generateFieldInitializationCode(field); - } - } - } - System.out.println(); - // rest of generated class contents - } - - return false; - - } - - private String determineOutputDirectory(String packageName) { - if (packageName.contains("enterprise") && packageName.contains("common")) { - return "common/python"; - } else if (packageName.contains("enterprise") && packageName.contains("snowflake")) { - return "snowflake/python"; - } else if (packageName.contains("enterprise") && packageName.contains("spark")) { - return "spark/python"; - } else { - return "python/zingg"; - } - } - - private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException { - fileWriter.write("from zingg.otherThanGenerated import *\n"); - if (element.getSimpleName().contentEquals("Pipe")) { - fileWriter.write("import logging\n"); - fileWriter.write("LOG = logging.getLogger(\"zingg.pipes\")\n"); - fileWriter.write("\n"); - fileWriter.write("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe\n"); - fileWriter.write("FilePipe = getJVM().zingg.common.client.pipe.FilePipe\n"); - fileWriter.write("JStructType = getJVM().org.apache.spark.sql.types.StructType\n"); - fileWriter.write("\n"); - } - private void generateImportsAndDeclarations() { - System.out.println("import logging"); - System.out.println("from zingg.client import *"); - System.out.println("LOG = logging.getLogger(\"zingg.pipes\")"); - System.out.println(); - System.out.println("JPipe = getJVM().zingg.spark.client.pipe.SparkPipe"); - System.out.println("FilePipe = getJVM().zingg.common.client.pipe.FilePipe"); - System.out.println("JStructType = getJVM().org.apache.spark.sql.types.StructType"); - System.out.println(); - } - - private void generateClassInitializationCode(TypeElement classElement, Element element) { - if (element.getSimpleName().contentEquals("Pipe")) { - System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); - } - else if (element.getSimpleName().contentEquals("EPipe")) { - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setPassthroughExpr(passthroughExpr)\n"); - } - else if (element.getSimpleName().contentEquals("Arguments")) { - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n"); - } - else if (element.getSimpleName().contentEquals("EArguments")) { - fileWriter.write(" self." 
+ element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n"); - } - else if (element.getSimpleName().contentEquals("FieldDefinition")) { - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.FieldDefinition()\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFieldName(name)\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setDataType(self.stringify(dataType))\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setMatchType(matchType)\n"); - fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFields(name)\n"); - fileWriter.write("\n"); - fileWriter.write(" def getFieldDefinition(self):\n"); - fileWriter.write(" return self.fielddefinition\n"); - } - fileWriter.write("\n"); - System.out.println(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()"); - } - - // private void generateFieldInitializationCode(VariableElement field, ExecutableElement methodElement, TypeElement classElement) { - private void generateFieldInitializationCode(VariableElement field) { - System.out.println(" self.pipe." + field.getSimpleName() + " = " + field.getSimpleName()); - // String fieldName = field.getSimpleName().toString(); - // String methodName = methodElement.getSimpleName().toString(); - // System.out.println(" self." + fieldName + " = " + "getJVM()." + - // classElement.getQualifiedName().toString() + "." + methodName + "(" + fieldName + ")"); - } - - private String generateConstructorParameters(TypeElement classElement) { - StringBuilder parameters = new StringBuilder(); - for (VariableElement field : ElementFilter.fieldsIn(classElement.getEnclosedElements())) { - if (!field.getSimpleName().contentEquals("serialVersionUID")) { - parameters.append(", "); - parameters.append(field.getSimpleName()); - } - } - return parameters.toString(); - } - -} diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java deleted file mode 100644 index fe0b02747..000000000 --- a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java +++ /dev/null @@ -1,76 +0,0 @@ -package zingg.common.py.processors; - -import java.util.List; -import javax.annotation.processing.*; -import javax.lang.model.type.TypeMirror; -import javax.lang.model.type.TypeKind; -import java.util.Set; -import javax.lang.model.element.*; -import javax.lang.model.util.ElementFilter; - -import zingg.common.py.annotations.*; - -@SupportedAnnotationTypes("zingg.common.py.annotations.PythonMethod") -public class PythonMethodProcessor extends AbstractProcessor { - - private boolean importsAndDeclarationsGenerated = false; - - @Override - public boolean process(Set annotations, RoundEnvironment roundEnv) { - - // process Services annotation - for (Element element : roundEnv.getElementsAnnotatedWith(PythonMethod.class)) { - - if (element.getKind() == ElementKind.METHOD) { - ExecutableElement methodElement = (ExecutableElement) element; - System.out.println(" def " + methodElement.getSimpleName() + "(self" + - generateMethodSignature(methodElement) + "):\n " + generateMethodReturn(methodElement)); - generateFieldAssignment(methodElement); - } - System.out.println(); - - // rest of generated class contents - } - return false; - } - - 
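// For orientation, a hedged reconstruction of what the println calls above emitted
// for a zero-argument getter such as getName() (illustrative only, not a generated
// file from this patch):
//   def getName(self):
//       return getName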
private String generateMethodSignature(ExecutableElement methodElement) { - StringBuilder signature = new StringBuilder(); - signature.append(generateMethodParameters(methodElement)); - return signature.toString(); - } - - private String generateMethodParameters(ExecutableElement methodElement) { - StringBuilder parameters = new StringBuilder(); - for (VariableElement parameter : methodElement.getParameters()) { - parameters.append(", "); - parameters.append(parameter.getSimpleName()); - } - return parameters.toString(); - } - - private String generateMethodReturn(ExecutableElement methodElement) { - TypeMirror returnType = methodElement.getReturnType(); - if (returnType.getKind() == TypeKind.VOID) { - return ""; - } else { - String returnTypeString = resolveType(returnType); - String variableName = methodElement.getSimpleName().toString(); - return "return " + variableName; - } - } - - private String resolveType(TypeMirror typeMirror) { - return typeMirror.toString(); - } - - private void generateFieldAssignment(ExecutableElement methodElement) { - List parameters = methodElement.getParameters(); - if (!parameters.isEmpty()) { - VariableElement parameter = parameters.get(0); - String variableName = parameter.getSimpleName().toString(); - System.out.println(" self." + variableName + " = " + variableName); - } -} - -} From 3e37b6d93f856e306f6941514a2443388d19a72a Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Thu, 7 Mar 2024 16:56:48 +0530 Subject: [PATCH 102/219] fixed missing import of Named interface --- .../src/main/java/zingg/common/client/FieldDefinition.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 0e3ad4d99..3d89e9756 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -22,6 +22,8 @@ import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import zingg.common.client.cols.Named; + /** * This class defines each field that we use in matching We can use this to From ee6027e40788410b7b67796dd30e46560f00f033 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 9 Mar 2024 13:14:25 +0530 Subject: [PATCH 103/219] field def added method isUnused --- .../src/main/java/zingg/common/client/FieldDefinition.java | 5 +++-- .../java/zingg/common/client/cols/FieldDefSelectedCols.java | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 3d89e9756..676829a88 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -10,6 +10,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonProcessingException; @@ -121,9 +122,9 @@ public void setFieldName(String fieldName) { this.fieldName = fieldName; } + @JsonIgnore public boolean isDontUse() { - // TODO Auto-generated method stub - throw new UnsupportedOperationException("Unimplemented method 'isDontUse'"); + return (matchType != null && 
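// A minimal usage sketch of the new flag (hypothetical snippet, not part of this
// patch; assumes the List-valued matchType read by the expression below):
//   FieldDefinition fd = new FieldDefinition();
//   fd.setMatchType(Arrays.asList(MatchType.DONT_USE));
//   fd.isDontUse();   // true: DONT_USE is present
//   fd.setMatchType(Arrays.asList(MatchType.FUZZY));
//   fd.isDontUse();   // false: the field keeps participating in matching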
matchType.contains(MatchType.DONT_USE));
 }

 @Override
diff --git a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java
index f0cf06f86..af5f615a0 100644
--- a/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java
+++ b/common/client/src/main/java/zingg/common/client/cols/FieldDefSelectedCols.java
@@ -25,7 +25,7 @@ protected List getColList(List fieldDefs, boo
 List namedList = new ArrayList();
 for (FieldDefinition fieldDef : fieldDefs) {
- if (showConcise && fieldDef.matchType.contains(MatchType.DONT_USE)) {
+ if (showConcise && fieldDef.isDontUse()) {
 continue;
 }
 namedList.add(fieldDef);

From 7df3540f5cfd591f53ee1e05e8724d205e026138 Mon Sep 17 00:00:00 2001
From: SemyonSinchenko
Date: Sat, 9 Mar 2024 15:41:15 +0100
Subject: [PATCH 104/219] Make sphinx work

A temporary solution, based on the following:
- introducing a new env variable ZINGG_DRY_RUN
- if the variable is set:
  + mimic globally used JVM-stuff
  + otherwise do nothing
++ slightly update ignore and docs/Makefile
++ apply formatting to client.py

On branch 762-fix_sphinx_build
Changes to be committed:
  modified: .gitignore
  modified: python/docs/Makefile
  new file: python/pyproject.toml
  modified: python/zingg/client.py
---
 .gitignore             |   7 +
 python/docs/Makefile   |   2 +
 python/pyproject.toml  |   2 +
 python/zingg/client.py | 450 +++++++++++++++++++++++++----------------
 4 files changed, 286 insertions(+), 175 deletions(-)
 create mode 100644 python/pyproject.toml

diff --git a/.gitignore b/.gitignore
index 18b75246c..4a3c2fe76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,10 @@ python/docs/_build/_doctrees
 **/python/build/*
 **/assembly/.classpath
 **/.DS_Store
+
+# Python stuff
+.env
+.venv
+
+# Sphinx _build
+**/_build
diff --git a/python/docs/Makefile b/python/docs/Makefile
index d4bb2cbb9..9847005dd 100644
--- a/python/docs/Makefile
+++ b/python/docs/Makefile
@@ -17,4 +17,6 @@ help:
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
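# Note on the recipe below (a hedged sketch, not part of this patch): make runs
# each recipe line in its own shell, so an exported variable does not survive to
# the next line; a single-line form with the same intent would be
#   ZINGG_DRY_RUN=1 $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)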
%: Makefile + export ZINGG_DRY_RUN=1 @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + unset ZINGG_DRY_RUN diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 000000000..16637529c --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,2 @@ +[tool.ruff] +line-length = 150 diff --git a/python/zingg/client.py b/python/zingg/client.py index 3c748a978..6126bbcfa 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -4,108 +4,183 @@ This module is the main entry point of the Zingg Python API """ -import logging +from __future__ import annotations import argparse -import pandas as pd -from pyspark.sql import DataFrame +import logging +import os +from typing import Any -from pyspark import SparkConf, SparkContext, SQLContext +import pandas as pd +from pyspark import SparkContext, SQLContext +from pyspark.sql import DataFrame, SparkSession -from py4j.java_collections import SetConverter, MapConverter, ListConverter -from pyspark.sql import SparkSession -import os LOG = logging.getLogger("zingg") _spark_ctxt = None _sqlContext = None _spark = None -_zingg_jar = 'zingg-0.4.0.jar' +_zingg_jar = "zingg-0.4.0.jar" + def initSparkClient(): global _spark_ctxt global _sqlContext - global _spark + global _spark _spark_ctxt = SparkContext.getOrCreate() _sqlContext = SQLContext(_spark_ctxt) _spark = SparkSession.builder.getOrCreate() return 1 + def initDataBricksConectClient(): global _spark_ctxt global _sqlContext - global _spark - jar_path = os.getenv('ZINGG_HOME')+'/'+_zingg_jar - _spark = SparkSession.builder.config('spark.jars', jar_path).getOrCreate() + global _spark + jar_path = os.getenv("ZINGG_HOME") + "/" + _zingg_jar + _spark = SparkSession.builder.config("spark.jars", jar_path).getOrCreate() _spark_ctxt = _spark.sparkContext _sqlContext = SQLContext(_spark_ctxt) return 1 + def initClient(): global _spark_ctxt global _sqlContext - global _spark + global _spark if _spark_ctxt is None: - DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') - if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': + DATABRICKS_CONNECT = os.getenv("DATABRICKS_CONNECT") + if DATABRICKS_CONNECT == "Y" or DATABRICKS_CONNECT == "y": return initDataBricksConectClient() else: return initSparkClient() else: return 1 + def getSparkContext(): if _spark_ctxt is None: initClient() return _spark_ctxt + def getSparkSession(): if _spark is None: initClient() return _spark + def getSqlContext(): if _sqlContext is None: initClient() return _sqlContext + def getJVM(): + # TODO: Document this environ variable + is_dry_run = os.environ.get("ZINGG_DRY_RUN", 0) + if is_dry_run: + + class Dummy: + """Dummy class for handling JVM-magick without actual starting of Java""" + + def __init__(self, attrs: dict[str, Any]): + for k, v in attrs.items(): + self.__setattr__(k, v) + + # TODO: replace this magic by Context-like implementation + return Dummy( + { + "org": Dummy({"apache": Dummy({"spark": Dummy({"sql": Dummy({"types": Dummy({"StructType": None})})})})}), + "zingg": Dummy( + { + "common": Dummy( + { + "client": Dummy( + { + "util": Dummy({"ColName": None}), + "MatchType": None, + "ClientOptions": Dummy( + { + "PHASE": None, + "CONF": None, + "LICENSE": None, + "EMAIL": None, + "LOCATION": None, + "REMOTE": None, + "ZINGG_DIR": None, + "MODEL_ID": None, + "COLUMN": None, + } + ), + "ZinggOptions": None, + "pipe": Dummy( + { + "FilePipe": None, + } + ), + } + ), + "core": Dummy({"util": Dummy({"LabelMatchType": None})}), + } + ), + "spark": Dummy( + { + "client": Dummy( + { + 
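# Hedged usage sketch (chains exactly as wired in this mapping): with ZINGG_DRY_RUN
# set, lookups that would normally go through the py4j gateway resolve against this
# Dummy tree instead, e.g.
#   getJVM().zingg.common.client.util.ColName       # -> None, no JVM started
#   getJVM().org.apache.spark.sql.types.StructType  # -> None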
"pipe": Dummy( + { + "SparkPipe": None, + } + ) + } + ) + } + ), + } + ), + } + ) return getSparkContext()._jvm + def getGateway(): return getSparkContext()._gateway + ColName = getJVM().zingg.common.client.util.ColName MatchType = getJVM().zingg.common.client.MatchType ClientOptions = getJVM().zingg.common.client.ClientOptions ZinggOptions = getJVM().zingg.common.client.ZinggOptions LabelMatchType = getJVM().zingg.common.core.util.LabelMatchType -UpdateLabelMode = 'Overwrite' +UpdateLabelMode = "Overwrite" + def getDfFromDs(data): - """ Method to convert spark dataset to dataframe + """Method to convert spark dataset to dataframe :param data: provide spark dataset :type data: DataSet :return: converted spark dataframe - :rtype: DataFrame + :rtype: DataFrame """ return DataFrame(data.df(), getSqlContext()) + def getPandasDfFromDs(data): - """ Method to convert spark dataset to pandas dataframe + """Method to convert spark dataset to pandas dataframe :param data: provide spark dataset :type data: DataSet :return: converted pandas dataframe - :rtype: DataFrame + :rtype: DataFrame """ df = getDfFromDs(data) return pd.DataFrame(df.collect(), columns=df.columns) class Zingg: - """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. + """This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. :param args: arguments for training and matching :type args: Arguments @@ -118,25 +193,25 @@ def __init__(self, args, options): self.inpArgs = args self.inpOptions = options self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions()) - + def init(self): - """ Method to initialize zingg client by reading internal configurations and functions """ + """Method to initialize zingg client by reading internal configurations and functions""" self.client.init() def execute(self): - """ Method to execute this class object """ + """Method to execute this class object""" self.client.execute() - + def initAndExecute(self): - """ Method to run both init and execute methods consecutively """ + """Method to run both init and execute methods consecutively""" self.client.init() - DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT') - if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y': + DATABRICKS_CONNECT = os.getenv("DATABRICKS_CONNECT") + if DATABRICKS_CONNECT == "Y" or DATABRICKS_CONNECT == "y": options = self.client.getOptions() inpPhase = options.get(ClientOptions.PHASE).getValue() - if (inpPhase==ZinggOptions.LABEL.getValue()): + if inpPhase == ZinggOptions.LABEL.getValue(): self.executeLabel() - elif (inpPhase==ZinggOptions.UPDATE_LABEL.getValue()): + elif inpPhase == ZinggOptions.UPDATE_LABEL.getValue(): self.executeLabelUpdate() else: self.client.execute() @@ -144,43 +219,48 @@ def initAndExecute(self): self.client.execute() def executeLabel(self): - """ Method to run label phase """ + """Method to run label phase""" self.client.getTrainingDataModel().setMarkedRecordsStat(self.getMarkedRecords()) unmarkedRecords = self.getUnmarkedRecords() - updatedRecords = self.processRecordsCli(unmarkedRecords,self.inpArgs) - self.writeLabelledOutput(updatedRecords,self.inpArgs) + updatedRecords = self.processRecordsCli(unmarkedRecords, self.inpArgs) + self.writeLabelledOutput(updatedRecords, self.inpArgs) def 
executeLabelUpdate(self): - """ Method to run label update phase """ - self.processRecordsCliLabelUpdate(self.getMarkedRecords(),self.inpArgs) + """Method to run label update phase""" + self.processRecordsCliLabelUpdate(self.getMarkedRecords(), self.inpArgs) def getMarkedRecords(self): - """ Method to get marked record dataset from the inputpipe + """Method to get marked record dataset from the inputpipe :return: spark dataset containing marked records - :rtype: Dataset + :rtype: Dataset """ return self.client.getMarkedRecords() def getUnmarkedRecords(self): - """ Method to get unmarked record dataset from the inputpipe + """Method to get unmarked record dataset from the inputpipe :return: spark dataset containing unmarked records - :rtype: Dataset + :rtype: Dataset """ return self.client.getUnmarkedRecords() - def processRecordsCli(self,unmarkedRecords,args): - """ Method to get user input on unmarked records + def processRecordsCli(self, unmarkedRecords, args): + """Method to get user input on unmarked records :return: spark dataset containing updated records - :rtype: Dataset + :rtype: Dataset """ trainingDataModel = self.client.getTrainingDataModel() labelDataViewHelper = self.client.getLabelDataViewHelper() if unmarkedRecords is not None and unmarkedRecords.count() > 0: - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + labelDataViewHelper.printMarkedRecordsStat( + trainingDataModel.getPositivePairsCount(), + trainingDataModel.getNegativePairsCount(), + trainingDataModel.getNotSurePairsCount(), + trainingDataModel.getTotalCount(), + ) unmarkedRecords = unmarkedRecords.cache() displayCols = labelDataViewHelper.getDisplayColumns(unmarkedRecords, args.getArgs()) clusterIdZFrame = labelDataViewHelper.getClusterIdsFrame(unmarkedRecords) @@ -195,37 +275,53 @@ def processRecordsCli(self,unmarkedRecords,args): msg1 = labelDataViewHelper.getMsg1(index, totalPairs) msg2 = labelDataViewHelper.getMsg2(prediction, score) - labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), msg1, msg2) + labelDataViewHelper.displayRecords( + labelDataViewHelper.getDSUtil().select(currentPair, displayCols), + msg1, + msg2, + ) selected_option = input() - while int(selected_option) not in [0,1,2,9]: - print('Please enter valid option') + while int(selected_option) not in [0, 1, 2, 9]: + print("Please enter valid option") selected_option = input("Enter choice: ") if int(selected_option) == 9: print("User has quit in the middle. Updating the records.") - break + break trainingDataModel.updateLabellerStat(int(selected_option), 1) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - updatedRecords = trainingDataModel.updateRecords(int(selected_option), currentPair, updatedRecords) + labelDataViewHelper.printMarkedRecordsStat( + trainingDataModel.getPositivePairsCount(), + trainingDataModel.getNegativePairsCount(), + trainingDataModel.getNotSurePairsCount(), + trainingDataModel.getTotalCount(), + ) + updatedRecords = trainingDataModel.updateRecords(int(selected_option), currentPair, updatedRecords) print("Processing finished.") return updatedRecords else: - print("It seems there are no unmarked records at this moment. 
Please run findTrainingData job to build some pairs to be labelled and then run this labeler.") + print( + "It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler." + ) return None - - def processRecordsCliLabelUpdate(self,lines,args): + + def processRecordsCliLabelUpdate(self, lines, args): trainingDataModel = self.client.getTrainingDataModel() labelDataViewHelper = self.client.getLabelDataViewHelper() - if (lines is not None and lines.count() > 0): + if lines is not None and lines.count() > 0: trainingDataModel.setMarkedRecordsStat(lines) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) + labelDataViewHelper.printMarkedRecordsStat( + trainingDataModel.getPositivePairsCount(), + trainingDataModel.getNegativePairsCount(), + trainingDataModel.getNotSurePairsCount(), + trainingDataModel.getTotalCount(), + ) displayCols = labelDataViewHelper.getDSUtil().getFieldDefColumns(lines, args.getArgs(), False, args.getArgs().getShowConcise()) updatedRecords = None recordsToUpdate = lines selectedOption = -1 - while (str(selectedOption) != '9'): + while str(selectedOption) != "9": cluster_id = input("\n\tPlease enter the cluster id (or 9 to exit): ") - if str(cluster_id) == '9': + if str(cluster_id) == "9": print("User has exit in the middle. Updating the records.") break currentPair = lines.filter(lines.equalTo(ColName.CLUSTER_COLUMN, cluster_id)) @@ -233,23 +329,32 @@ def processRecordsCliLabelUpdate(self,lines,args): print("\tInvalid cluster id. Enter '9' to exit") continue - matchFlag = currentPair.getAsInt(currentPair.head(),ColName.MATCH_FLAG_COL) - preMsg = "\n\tThe record pairs belonging to the input cluster id "+cluster_id+" are:" - postMsg = "\tThe above pair is labeled as "+str(matchFlag)+"\n" - labelDataViewHelper.displayRecords(labelDataViewHelper.getDSUtil().select(currentPair, displayCols), preMsg, postMsg) + matchFlag = currentPair.getAsInt(currentPair.head(), ColName.MATCH_FLAG_COL) + preMsg = "\n\tThe record pairs belonging to the input cluster id " + cluster_id + " are:" + postMsg = "\tThe above pair is labeled as " + str(matchFlag) + "\n" + labelDataViewHelper.displayRecords( + labelDataViewHelper.getDSUtil().select(currentPair, displayCols), + preMsg, + postMsg, + ) selectedOption = input() trainingDataModel.updateLabellerStat(int(selectedOption), 1) trainingDataModel.updateLabellerStat(matchFlag, -1) - labelDataViewHelper.printMarkedRecordsStat(trainingDataModel.getPositivePairsCount(),trainingDataModel.getNegativePairsCount(),trainingDataModel.getNotSurePairsCount(),trainingDataModel.getTotalCount()) - - if (str(selectedOption) == '9'): + labelDataViewHelper.printMarkedRecordsStat( + trainingDataModel.getPositivePairsCount(), + trainingDataModel.getNegativePairsCount(), + trainingDataModel.getNotSurePairsCount(), + trainingDataModel.getTotalCount(), + ) + + if str(selectedOption) == "9": print("User has quit in the middle. 
Updating the records.") break - recordsToUpdate = recordsToUpdate.filter(recordsToUpdate.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) + recordsToUpdate = recordsToUpdate.filter(recordsToUpdate.notEqual(ColName.CLUSTER_COLUMN, cluster_id)) - if (updatedRecords is not None): - updatedRecords = updatedRecords.filter(updatedRecords.notEqual(ColName.CLUSTER_COLUMN,cluster_id)) + if updatedRecords is not None: + updatedRecords = updatedRecords.filter(updatedRecords.notEqual(ColName.CLUSTER_COLUMN, cluster_id)) updatedRecords = trainingDataModel.updateRecords(int(selectedOption), currentPair, updatedRecords) @@ -259,32 +364,32 @@ def processRecordsCliLabelUpdate(self,lines,args): outPipe = trainingDataModel.getOutputPipe(args.getArgs()) outPipe.setMode(UpdateLabelMode) - trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs(),outPipe) + trainingDataModel.writeLabelledOutput(updatedRecords, args.getArgs(), outPipe) print("Processing finished.") return updatedRecords else: print("There is no marked record for updating. Please run findTrainingData/label jobs to generate training data.") return None - - def writeLabelledOutput(self,updatedRecords,args): - """ Method to write updated records after user input - """ + def writeLabelledOutput(self, updatedRecords, args): + """Method to write updated records after user input""" trainingDataModel = self.client.getTrainingDataModel() if updatedRecords is not None: - trainingDataModel.writeLabelledOutput(updatedRecords,args.getArgs()) + trainingDataModel.writeLabelledOutput(updatedRecords, args.getArgs()) - def writeLabelledOutputFromPandas(self,candidate_pairs_pd,args): - """ Method to write updated records (as pandas df) after user input - """ + def writeLabelledOutputFromPandas(self, candidate_pairs_pd, args): + """Method to write updated records (as pandas df) after user input""" markedRecordsAsDS = (getSparkSession().createDataFrame(candidate_pairs_pd))._jdf # pands df gives z_isMatch as long so needs to be cast - markedRecordsAsDS = markedRecordsAsDS.withColumn(ColName.MATCH_FLAG_COL,markedRecordsAsDS.col(ColName.MATCH_FLAG_COL).cast("int")) + markedRecordsAsDS = markedRecordsAsDS.withColumn( + ColName.MATCH_FLAG_COL, + markedRecordsAsDS.col(ColName.MATCH_FLAG_COL).cast("int"), + ) updatedRecords = getJVM().zingg.spark.client.SparkFrame(markedRecordsAsDS) - self.writeLabelledOutput(updatedRecords,args) + self.writeLabelledOutput(updatedRecords, args) def setArguments(self, args): - """ Method to set Arguments + """Method to set Arguments :param args: provide arguments for this class object :type args: Arguments @@ -292,7 +397,7 @@ def setArguments(self, args): self.client.setArguments() def getArguments(self): - """ Method to get atguments of this class object + """Method to get atguments of this class object :return: The pointer containing address of the Arguments object of this class object :rtype: pointer(Arguments) @@ -300,7 +405,7 @@ def getArguments(self): return self.client.getArguments() def getOptions(self): - """ Method to get client options of this class object + """Method to get client options of this class object :return: The pointer containing the address of the ClientOptions object of this class object :rtype: pointer(ClientOptions) @@ -308,56 +413,55 @@ def getOptions(self): return self.client.getOptions() def setOptions(self, options): - """ Method to set atguments of this class object + """Method to set atguments of this class object :param options: provide client options for this class object :type options: ClientOptions 
:return: The pointer containing address of the ClientOptions object of this class object - :rtype: pointer(ClientOptions) + :rtype: pointer(ClientOptions) """ return self.client.setOptions(options) def getMarkedRecordsStat(self, markedRecords, value): - """ Method to get No. of records that is marked + """Method to get No. of records that is marked :param markedRecords: spark dataset containing marked records :type markedRecords: Dataset :param value: flag value to check if markedRecord is initially matched or not :type value: long :return: The no. of marked records - :rtype: int + :rtype: int """ return self.client.getMarkedRecordsStat(markedRecords, value) def getMatchedMarkedRecordsStat(self): - """ Method to get No. of records that are marked and matched + """Method to get No. of records that are marked and matched :return: The bo. of matched marked records - :rtype: int + :rtype: int """ return self.client.getMatchedMarkedRecordsStat(self.getMarkedRecords()) def getUnmatchedMarkedRecordsStat(self): - """ Method to get No. of records that are marked and unmatched + """Method to get No. of records that are marked and unmatched :return: The no. of unmatched marked records - :rtype: int + :rtype: int """ return self.client.getUnmatchedMarkedRecordsStat(self.getMarkedRecords()) def getUnsureMarkedRecordsStat(self): - """ Method to get No. of records that are marked and Not Sure if its matched or not + """Method to get No. of records that are marked and Not Sure if its matched or not :return: The no. of Not Sure marked records - :rtype: int + :rtype: int """ return self.client.getUnsureMarkedRecordsStat(self.getMarkedRecords()) - class ZinggWithSpark(Zingg): - """ This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. + """This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. :param args: arguments for training and matching :type args: Arguments @@ -369,9 +473,9 @@ class ZinggWithSpark(Zingg): def __init__(self, args, options): self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions(), getSparkSession()._jsparkSession) - + class Arguments: - """ This class helps supply match arguments to Zingg. There are 3 basic steps in any match process. + """This class helps supply match arguments to Zingg. There are 3 basic steps in any match process. :Defining: specifying information about data location, fields, and our notion of similarity. 
:training: making Zingg learn the matching rules @@ -382,7 +486,7 @@ def __init__(self): self.args = getJVM().zingg.common.client.Arguments() def setFieldDefinition(self, fieldDef): - """ Method convert python objects to java FieldDefinition objects and set the field definitions associated with this client + """Method convert python objects to java FieldDefinition objects and set the field definitions associated with this client :param fieldDef: python FieldDefinition object list :type fieldDef: List(FieldDefinition) @@ -393,16 +497,16 @@ def setFieldDefinition(self, fieldDef): self.args.setFieldDefinition(javaFieldDef) def getArgs(self): - """ Method to get pointer address of this class + """Method to get pointer address of this class :return: The pointer containing the address of this class object :rtype: pointer(Arguments) - + """ return self.args def setArgs(self, argumentsObj): - """ Method to set this class object + """Method to set this class object :param argumentsObj: Argument object to set this object :type argumentsObj: pointer(Arguments) @@ -410,7 +514,7 @@ def setArgs(self, argumentsObj): self.args = argumentsObj def setData(self, *pipes): - """ Method to set the file path of the file to be matched. + """Method to set the file path of the file to be matched. :param pipes: input data pipes separated by comma e.g. (pipe1,pipe2,..) :type pipes: Pipe[] @@ -421,7 +525,7 @@ def setData(self, *pipes): self.args.setData(dataPipe) def setOutput(self, *pipes): - """ Method to set the output directory where the match result will be saved + """Method to set the output directory where the match result will be saved :param pipes: output data pipes separated by comma e.g. (pipe1,pipe2,..) :type pipes: Pipe[] @@ -430,33 +534,33 @@ def setOutput(self, *pipes): for idx, pipe in enumerate(pipes): outputPipe[idx] = pipe.getPipe() self.args.setOutput(outputPipe) - + def getZinggBaseModelDir(self): return self.args.getZinggBaseModelDir() def getZinggModelDir(self): return self.args.getZinggModelDir() - + def getZinggBaseTrainingDataDir(self): - """ Method to get the location of the folder where Zingg - saves the training data found by findTrainingData + """Method to get the location of the folder where Zingg + saves the training data found by findTrainingData """ return self.args.getZinggBaseTrainingDataDir() def getZinggTrainingDataUnmarkedDir(self): - """ Method to get the location of the folder where Zingg - saves the training data found by findTrainingData + """Method to get the location of the folder where Zingg + saves the training data found by findTrainingData """ return self.args.getZinggTrainingDataUnmarkedDir() - + def getZinggTrainingDataMarkedDir(self): - """ Method to get the location of the folder where Zingg - saves the marked training data labeled by the user + """Method to get the location of the folder where Zingg + saves the marked training data labeled by the user """ return self.args.getZinggTrainingDataMarkedDir() - + def setTrainingSamples(self, *pipes): - """ Method to set existing training samples to be matched. + """Method to set existing training samples to be matched. :param pipes: input training data pipes separated by comma e.g. (pipe1,pipe2,..) 
:type pipes: Pipe[] @@ -467,19 +571,18 @@ def setTrainingSamples(self, *pipes): self.args.setTrainingSamples(dataPipe) def setModelId(self, id): - """ Method to set the output directory where the match output will be saved + """Method to set the output directory where the match output will be saved - :param id: model id value + :param id: model id value :type id: String """ self.args.setModelId(id) - + def getModelId(self): return self.args.getModelId() - def setZinggDir(self, f): - """ Method to set the location for Zingg to save its internal computations and models. Please set it to a place where the program has to write access. + """Method to set the location for Zingg to save its internal computations and models. Please set it to a place where the program has to write access. :param f: Zingg directory name of the models :type f: String @@ -487,7 +590,7 @@ def setZinggDir(self, f): self.args.setZinggDir(f) def setNumPartitions(self, numPartitions): - """ Method to set NumPartitions parameter value + """Method to set NumPartitions parameter value Sample size to use for seeding labeled data We don't want to run over all the data, as we want a quick way to seed some labeled data that we can manually edit :param numPartitions: number of partitions for given data pipes @@ -496,7 +599,7 @@ def setNumPartitions(self, numPartitions): self.args.setNumPartitions(numPartitions) def setLabelDataSampleSize(self, labelDataSampleSize): - """ Method to set labelDataSampleSize parameter value + """Method to set labelDataSampleSize parameter value Set the fraction of data to be used from the complete data set to be used for seeding the labeled data Labelling is costly and we want a fast approximate way of looking at a small sample of the records and identifying expected matches and nonmatches :param labelDataSampleSize: value between 0.0 and 1.0 denoting portion of dataset to use in generating seed samples @@ -505,7 +608,7 @@ def setLabelDataSampleSize(self, labelDataSampleSize): self.args.setLabelDataSampleSize(labelDataSampleSize) def writeArgumentsToJSON(self, fileName): - """ Method to write JSON file from the object of this class + """Method to write JSON file from the object of this class :param fileName: The CONF parameter value of ClientOption object or file address of json file :type fileName: String @@ -513,16 +616,16 @@ def writeArgumentsToJSON(self, fileName): getJVM().zingg.common.client.ArgumentsUtil().writeArgumentsToJSON(fileName, self.args) def setStopWordsCutoff(self, stopWordsCutoff): - """ Method to set stopWordsCutoff parameter value + """Method to set stopWordsCutoff parameter value By default, Zingg extracts 10% of the high frequency unique words from a dataset. If user wants different selection, they should set up StopWordsCutoff property :param stopWordsCutoff: The stop words cutoff parameter value of ClientOption object or file address of json file :type stopWordsCutoff: float """ self.args.setStopWordsCutoff(stopWordsCutoff) - + def setColumn(self, column): - """ Method to set stopWordsCutoff parameter value + """Method to set stopWordsCutoff parameter value By default, Zingg extracts 10% of the high frequency unique words from a dataset. 
If user wants different selection, they should set up StopWordsCutoff property :param stopWordsCutoff: The stop words cutoff parameter value of ClientOption object or file address of json file @@ -532,8 +635,8 @@ def setColumn(self, column): @staticmethod def createArgumentsFromJSON(fileName, phase): - """ Method to create an object of this class from the JSON file and phase parameter value. - + """Method to create an object of this class from the JSON file and phase parameter value. + :param fileName: The CONF parameter value of ClientOption object :type fileName: String :param phase: The PHASE parameter value of ClientOption object @@ -544,11 +647,10 @@ def createArgumentsFromJSON(fileName, phase): obj = Arguments() obj.args = getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSON(fileName, phase) return obj - - + def writeArgumentsToJSONString(self): - """ Method to create an object of this class from the JSON file and phase parameter value. - + """Method to create an object of this class from the JSON file and phase parameter value. + :param fileName: The CONF parameter value of ClientOption object :type fileName: String :param phase: The PHASE parameter value of ClientOption object @@ -557,30 +659,27 @@ def writeArgumentsToJSONString(self): :rtype: pointer(Arguments) """ return getJVM().zingg.common.client.ArgumentsUtil().writeArgumentstoJSONString(self.args) - + @staticmethod def createArgumentsFromJSONString(jsonArgs, phase): obj = Arguments() obj.args = getJVM().zingg.common.client.ArgumentsUtil().createArgumentsFromJSONString(jsonArgs, phase) return obj - - + def copyArgs(self, phase): argsString = self.writeArgumentsToJSONString() return self.createArgumentsFromJSONString(argsString, phase) - - - class ClientOptions: - """ Class that contains Client options for Zingg object + """Class that contains Client options for Zingg object :param phase: trainMatch, train, match, link, findAndLabel, findTrainingData, recommend etc :type phase: String :param args: Parse a list of Zingg command line options parameter values e.g. "--location" etc. optional argument for initializing this class. 
:type args: List(String) or None """ - PHASE = getJVM().zingg.common.client.ClientOptions.PHASE + + PHASE = getJVM().zingg.common.client.ClientOptions.PHASE """:PHASE: phase parameter for this class""" CONF = getJVM().zingg.common.client.ClientOptions.CONF """:CONF: conf parameter for this class""" @@ -601,28 +700,27 @@ class ClientOptions: def __init__(self, argsSent=None): print(argsSent) - if(argsSent == None): + if argsSent == None: args = [] else: args = argsSent.copy() - if (not (self.PHASE in args)): + if self.PHASE not in args: args.append(self.PHASE) args.append("peekModel") - if (not (self.LICENSE in args)): + if self.LICENSE not in args: args.append(self.LICENSE) args.append("zinggLic.txt") - if (not (self.EMAIL in args)): + if self.EMAIL not in args: args.append(self.EMAIL) args.append("zingg@zingg.ai") - if (not (self.CONF in args)): + if self.CONF not in args: args.append(self.CONF) args.append("dummyConf.json") - print("arguments for client options are ", args) + print("arguments for client options are ", args) self.co = getJVM().zingg.common.client.ClientOptions(args) - - + def getClientOptions(self): - """ Method to get pointer address of this class + """Method to get pointer address of this class :return: The pointer containing address of the this class object :rtype: pointer(ClientOptions) @@ -630,17 +728,17 @@ def getClientOptions(self): return self.co def getOptionValue(self, option): - """ Method to get value for the key option + """Method to get value for the key option :param option: key to geting the value :type option: String - :return: The value which is mapped for given key - :rtype: String + :return: The value which is mapped for given key + :rtype: String """ return self.co.getOptionValue(option) def setOptionValue(self, option, value): - """ Method to map option key to the given value + """Method to map option key to the given value :param option: key that is mapped with value :type option: String @@ -650,53 +748,53 @@ def setOptionValue(self, option, value): self.co.get(option).setValue(value) def getPhase(self): - """ Method to get PHASE value + """Method to get PHASE value :return: The PHASE parameter value - :rtype: String + :rtype: String """ return self.co.get(ClientOptions.PHASE).getValue() def setPhase(self, newValue): - """ Method to set PHASE value + """Method to set PHASE value :param newValue: name of the phase :type newValue: String :return: The pointer containing address of the this class object after seting phase - :rtype: pointer(ClientOptions) + :rtype: pointer(ClientOptions) """ self.co.get(ClientOptions.PHASE).setValue(newValue) def getConf(self): - """ Method to get CONF value + """Method to get CONF value :return: The CONF parameter value - :rtype: String + :rtype: String """ return self.co.get(ClientOptions.CONF).getValue() def hasLocation(self): - """ Method to check if this class has LOCATION parameter set as None or not + """Method to check if this class has LOCATION parameter set as None or not :return: The boolean value if LOCATION parameter is present or not - :rtype: Bool + :rtype: Bool """ - if(self.co.get(ClientOptions.LOCATION)==None): + if self.co.get(ClientOptions.LOCATION) == None: return False else: return True def getLocation(self): - """ Method to get LOCATION value + """Method to get LOCATION value :return: The LOCATION parameter value - :rtype: String + :rtype: String """ return self.co.get(ClientOptions.LOCATION).getValue() class FieldDefinition: - """ This class defines each field that we use in matching We can use 
this to configure the properties of each field we use for matching in Zingg. + """This class defines each field that we use in matching We can use this to configure the properties of each field we use for matching in Zingg. :param name: name of the field :type name: String @@ -712,9 +810,9 @@ def __init__(self, name, dataType, *matchType): self.fd.setDataType(self.stringify(dataType)) self.fd.setMatchType(matchType) self.fd.setFields(name) - + def setStopWords(self, stopWords): - """ Method to add stopwords to this class object + """Method to add stopwords to this class object :param stopWords: The stop Words containing csv file's location :type stopWords: String @@ -722,7 +820,7 @@ def setStopWords(self, stopWords): self.fd.setStopWords(stopWords) def getFieldDefinition(self): - """ Method to get pointer address of this class + """Method to get pointer address of this class :return: The pointer containing the address of this class object :rtype: pointer(FieldDefinition) @@ -731,31 +829,33 @@ def getFieldDefinition(self): # should be stringify'ed before it is set in fd object def stringify(self, str): - """ Method to stringify'ed the dataType before it is set in FieldDefinition object - + """Method to stringify'ed the dataType before it is set in FieldDefinition object + :param str: dataType of the FieldDefinition :type str: String :return: The stringify'ed value of the dataType :rtype: String """ - + return str - + def parseArguments(argv): - """ This method is used for checking mandatory arguments and creating an arguments list from Command line arguments + """This method is used for checking mandatory arguments and creating an arguments list from Command line arguments :param argv: Values that are passed during the calling of the program along with the calling statement. :type argv: List :return: a list containing necessary arguments to run any phase :rtype: List """ - parser = argparse.ArgumentParser(description='Zingg\'s python APIs') - mandatoryOptions = parser.add_argument_group('mandatory arguments') - mandatoryOptions.add_argument('--phase', required=True, - help='python phase e.g. assessModel') - mandatoryOptions.add_argument('--conf', required=True, - help='JSON configuration with data input output locations and field definitions') + parser = argparse.ArgumentParser(description="Zingg's python APIs") + mandatoryOptions = parser.add_argument_group("mandatory arguments") + mandatoryOptions.add_argument("--phase", required=True, help="python phase e.g. 
assessModel") + mandatoryOptions.add_argument( + "--conf", + required=True, + help="JSON configuration with data input output locations and field definitions", + ) args, remaining_args = parser.parse_known_args(argv) LOG.debug("args: ", args) From 71ae05946d8321fbdd237b393fb6f03a691f1321 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Mar 2024 10:43:07 +0530 Subject: [PATCH 105/219] removed redundant method, causing confusion --- .../main/java/zingg/common/client/FieldDefinition.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/FieldDefinition.java b/common/client/src/main/java/zingg/common/client/FieldDefinition.java index 0adbd9e1a..ada4d07ec 100644 --- a/common/client/src/main/java/zingg/common/client/FieldDefinition.java +++ b/common/client/src/main/java/zingg/common/client/FieldDefinition.java @@ -126,10 +126,10 @@ public void setFieldName(String fieldName) { this.fieldName = fieldName; } - public boolean isDontUse() { - // TODO Auto-generated method stub - throw new UnsupportedOperationException("Unimplemented method 'isDontUse'"); - } +// public boolean isDontUse() { +// // TODO Auto-generated method stub +// throw new UnsupportedOperationException("Unimplemented method 'isDontUse'"); +// } @Override public String getName() { From fbf219c92334a3b964fef82c3b52352690517397 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Mar 2024 10:56:54 +0530 Subject: [PATCH 106/219] making EventsListener thread safe --- .../client/event/listeners/EventsListener.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java index d7beb1bdc..de80f4468 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -5,21 +5,19 @@ import zingg.common.client.util.ListMap; public class EventsListener { - private static EventsListener eventsListener = null; - private final ListMap eventListeners; + private static EventsListener _eventsListener = new EventsListener(); + private final ListMap eventListenersList; private EventsListener() { - eventListeners = new ListMap<>(); + eventListenersList = new ListMap(); } public static EventsListener getInstance() { - if (eventsListener == null) - eventsListener = new EventsListener(); - return eventsListener; + return _eventsListener; } public void addListener(Class eventClass, IEventListener listener) { - eventListeners.add(eventClass.getCanonicalName(), listener); + eventListenersList.add(eventClass.getCanonicalName(), listener); } public void fireEvent(IEvent event) throws ZinggClientException { @@ -28,7 +26,7 @@ public void fireEvent(IEvent event) throws ZinggClientException { private void listen(IEvent event) throws ZinggClientException { Class eventClass = event.getClass(); - for (IEventListener listener : eventListeners.get(eventClass.getCanonicalName())) { + for (IEventListener listener : eventListenersList.get(eventClass.getCanonicalName())) { listener.listen(event); } } From 28070d87ab44feecd620e1a45c3d5ab874287307 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Mar 2024 11:23:46 +0530 Subject: [PATCH 107/219] null checks needed for when event is not registered but gets fired --- .../client/event/listeners/EventsListener.java | 13 
++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java index de80f4468..42fccc7f9 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -1,5 +1,7 @@ package zingg.common.client.event.listeners; +import java.util.List; + import zingg.common.client.ZinggClientException; import zingg.common.client.event.events.IEvent; import zingg.common.client.util.ListMap; @@ -26,8 +28,13 @@ public void fireEvent(IEvent event) throws ZinggClientException { private void listen(IEvent event) throws ZinggClientException { Class eventClass = event.getClass(); - for (IEventListener listener : eventListenersList.get(eventClass.getCanonicalName())) { - listener.listen(event); - } + List listenerList = eventListenersList.get(eventClass.getCanonicalName()); + if (listenerList != null) { + for (IEventListener listener : listenerList) { + if (listener != null) { + listener.listen(event); + } + } + } } } From c4969dad5b2c9a7417b1fe3c4799ed24d00295ce Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Mar 2024 14:20:07 +0530 Subject: [PATCH 108/219] using filter instead of getDupesActualForGraph --- .../client/cols/PredictionColsSelector.java | 23 ++++++++++++ .../zingg/common/core/executor/Linker.java | 19 ++++++---- .../zingg/common/core/executor/Matcher.java | 34 +++++++---------- .../zingg/common/core/filter/IFilter.java | 9 +++++ .../common/core/filter/PredictionFilter.java | 37 +++++++++++++++++++ 5 files changed, 93 insertions(+), 29 deletions(-) create mode 100644 common/client/src/main/java/zingg/common/client/cols/PredictionColsSelector.java create mode 100644 common/core/src/main/java/zingg/common/core/filter/IFilter.java create mode 100644 common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java diff --git a/common/client/src/main/java/zingg/common/client/cols/PredictionColsSelector.java b/common/client/src/main/java/zingg/common/client/cols/PredictionColsSelector.java new file mode 100644 index 000000000..71baf980c --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/cols/PredictionColsSelector.java @@ -0,0 +1,23 @@ +package zingg.common.client.cols; + +import java.util.ArrayList; +import java.util.List; + +import zingg.common.client.util.ColName; + +public class PredictionColsSelector extends SelectedCols { + + public PredictionColsSelector() { + + List cols = new ArrayList(); + cols.add(ColName.ID_COL); + cols.add(ColName.COL_PREFIX + ColName.ID_COL); + cols.add(ColName.PREDICTION_COL); + cols.add(ColName.SCORE_COL); + + setCols(cols); + + } + + +} \ No newline at end of file diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 2dbbaa66c..711788c30 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -5,9 +5,11 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.PredictionColsSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; +import zingg.common.core.filter.PredictionFilter; import 
zingg.common.core.pairs.SelfPairBuilderSourceSensitive; @@ -32,12 +34,20 @@ public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Ex return getPairs(blocked, bAll, new SelfPairBuilderSourceSensitive (getDSUtil(),args)); } + @Override + protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ + PredictionFilter predictionFilter = new PredictionFilter(); // no input in constructor as all cols need to be returned + SelfPairBuilderSourceSensitive iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); + return getActualDupes(blocked, testData,predictionFilter, iPairBuilder); + } + @Override public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws ZinggClientException { try { // input dupes are pairs /// pick ones according to the threshold by user - ZFrame dupesActual = getDupesActualForGraph(dupes); + PredictionFilter predictionFilter = new PredictionFilter(); + ZFrame dupesActual = predictionFilter.filter(dupes); // all clusters consolidated in one place if (args.getOutput() != null) { @@ -56,11 +66,4 @@ public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws } } - @Override - public ZFrame getDupesActualForGraph(ZFrame dupes) { - ZFrame dupesActual = dupes - .filter(dupes.equalTo(ColName.PREDICTION_COL, ColValues.IS_MATCH_PREDICTION)); - return dupesActual; - } - } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index c3c23fc89..d7a3e00b1 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -8,12 +8,14 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; +import zingg.common.client.cols.PredictionColsSelector; import zingg.common.client.cols.ZidAndFieldDefSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; -import zingg.common.client.util.ColValues; import zingg.common.core.block.Canopy; import zingg.common.core.block.Tree; +import zingg.common.core.filter.IFilter; +import zingg.common.core.filter.PredictionFilter; import zingg.common.core.model.Model; import zingg.common.core.pairs.IPairBuilder; import zingg.common.core.pairs.SelfPairBuilder; @@ -78,11 +80,18 @@ protected ZFrame predictOnBlocks(ZFrameblocks) throws Exception, Z } protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ - ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData); - ZFramedupesActual = predictOnBlocks(blocks); - return getDupesActualForGraph(dupesActual); + PredictionFilter predictionFilter = new PredictionFilter(new PredictionColsSelector()); + SelfPairBuilder iPairBuilder = new SelfPairBuilder (getDSUtil(),args); + return getActualDupes(blocked, testData,predictionFilter, iPairBuilder); } + protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData, + IFilter predictionFilter, IPairBuilder iPairBuilder) throws Exception, ZinggClientException{ + ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData, iPairBuilder); + ZFramedupesActual = predictOnBlocks(blocks); + return predictionFilter.filter(dupesActual); + } + @Override public void execute() throws ZinggClientException { try { @@ -253,23 +262,6 @@ protected ZFrame getGraphWithScores(ZFrame graph, ZFrame getDupesActualForGraph(ZFramedupes) { - dupes = selectColsFromDupes(dupes); - LOG.debug("dupes al"); - if 
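// The predicate below moves, unchanged, into the new PredictionFilter introduced
// later in this patch; a hedged caller-side sketch of the replacement:
//   PredictionFilter predictionFilter = new PredictionFilter();
//   ZFrame matchesOnly = predictionFilter.filter(dupes);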
(LOG.isDebugEnabled()) dupes.show(); - return dupes.filter(dupes.equalTo(ColName.PREDICTION_COL,ColValues.IS_MATCH_PREDICTION)); - } - - protected ZFrame selectColsFromDupes(ZFramedupesActual) { - List cols = new ArrayList(); - cols.add(dupesActual.col(ColName.ID_COL)); - cols.add(dupesActual.col(ColName.COL_PREFIX + ColName.ID_COL)); - cols.add(dupesActual.col(ColName.PREDICTION_COL)); - cols.add(dupesActual.col(ColName.SCORE_COL)); - ZFrame dupesActual1 = dupesActual.select(cols); //.cache(); - return dupesActual1; - } - protected abstract StopWordsRemover getStopWords(); diff --git a/common/core/src/main/java/zingg/common/core/filter/IFilter.java b/common/core/src/main/java/zingg/common/core/filter/IFilter.java new file mode 100644 index 000000000..70d6b8eec --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/filter/IFilter.java @@ -0,0 +1,9 @@ +package zingg.common.core.filter; + +import zingg.common.client.ZFrame; + +public interface IFilter { + + public ZFrame filter(ZFrame df); + +} diff --git a/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java new file mode 100644 index 000000000..a825dcadd --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java @@ -0,0 +1,37 @@ +package zingg.common.core.filter; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.cols.PredictionColsSelector; +import zingg.common.client.util.ColName; +import zingg.common.client.util.ColValues; + +public class PredictionFilter implements IFilter { + + public static final Log LOG = LogFactory.getLog(PredictionFilter.class); + + protected PredictionColsSelector colsSelector; + + public PredictionFilter() { + super(); + } + + public PredictionFilter(PredictionColsSelector colsSelector) { + super(); + this.colsSelector = colsSelector; + } + + @Override + public ZFrame filter(ZFrame dupes) { + LOG.debug("dupes al"); + if (LOG.isDebugEnabled()) dupes.show(); + dupes = dupes.filter(dupes.equalTo(ColName.PREDICTION_COL,ColValues.IS_MATCH_PREDICTION)); + if(colsSelector!=null) { + dupes = dupes.select(colsSelector.getCols()); + } + return dupes; + } + +} From e4d2cce508318668ee606f9a0bd8c80d86b7ed67 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 12 Mar 2024 14:36:09 +0530 Subject: [PATCH 109/219] refactor to modular methods --- .../zingg/common/core/filter/PredictionFilter.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java index a825dcadd..3740ff17a 100644 --- a/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java +++ b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java @@ -25,13 +25,21 @@ public PredictionFilter(PredictionColsSelector colsSelector) { @Override public ZFrame filter(ZFrame dupes) { - LOG.debug("dupes al"); - if (LOG.isDebugEnabled()) dupes.show(); - dupes = dupes.filter(dupes.equalTo(ColName.PREDICTION_COL,ColValues.IS_MATCH_PREDICTION)); + dupes = filterMatches(dupes); + dupes = selectCols(dupes); + return dupes; + } + + protected ZFrame selectCols(ZFrame dupes) { if(colsSelector!=null) { dupes = dupes.select(colsSelector.getCols()); } return dupes; } + protected ZFrame filterMatches(ZFrame dupes) { + dupes = 
dupes.filter(dupes.equalTo(ColName.PREDICTION_COL,ColValues.IS_MATCH_PREDICTION)); + return dupes; + } + } From 33fc94eabc3d0c444ea63f0feaf58ba668203839 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 15 Mar 2024 08:55:56 +0530 Subject: [PATCH 110/219] separation of filter and select cols --- .../java/zingg/common/core/executor/Linker.java | 4 +--- .../java/zingg/common/core/executor/Matcher.java | 15 ++++++++++++--- .../common/core/filter/PredictionFilter.java | 16 ---------------- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 711788c30..2651af982 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -5,10 +5,8 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; -import zingg.common.client.cols.PredictionColsSelector; import zingg.common.client.options.ZinggOptions; import zingg.common.client.util.ColName; -import zingg.common.client.util.ColValues; import zingg.common.core.filter.PredictionFilter; import zingg.common.core.pairs.SelfPairBuilderSourceSensitive; @@ -36,7 +34,7 @@ public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Ex @Override protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ - PredictionFilter predictionFilter = new PredictionFilter(); // no input in constructor as all cols need to be returned + PredictionFilter predictionFilter = new PredictionFilter(); SelfPairBuilderSourceSensitive iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); return getActualDupes(blocked, testData,predictionFilter, iPairBuilder); } diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index d7a3e00b1..5478045fb 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -80,16 +80,25 @@ protected ZFrame predictOnBlocks(ZFrameblocks) throws Exception, Z } protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ - PredictionFilter predictionFilter = new PredictionFilter(new PredictionColsSelector()); + PredictionFilter predictionFilter = new PredictionFilter(); SelfPairBuilder iPairBuilder = new SelfPairBuilder (getDSUtil(),args); - return getActualDupes(blocked, testData,predictionFilter, iPairBuilder); + return getActualDupes(blocked, testData,predictionFilter, iPairBuilder,new PredictionColsSelector()); } protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData, IFilter predictionFilter, IPairBuilder iPairBuilder) throws Exception, ZinggClientException{ + return getActualDupes(blocked,testData,predictionFilter,iPairBuilder,null); + } + + protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData, + IFilter predictionFilter, IPairBuilder iPairBuilder, PredictionColsSelector colsSelector) throws Exception, ZinggClientException{ ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData, iPairBuilder); ZFramedupesActual = predictOnBlocks(blocks); - return predictionFilter.filter(dupesActual); + ZFrame filteredData = predictionFilter.filter(dupesActual); + if(colsSelector!=null) { + filteredData = filteredData.select(colsSelector.getCols()); + } + return filteredData; } @Override 
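[Illustrative sketch — not part of the patch. It recombines the hunks above to show the final shape of the match-selection flow after this change: pair building, prediction, match filtering, and optional column trimming are now separate collaborators. Generic parameters on ZFrame are elided here, as they are throughout this flattened diff.]

// assumes the classes introduced or touched in this patch: IFilter, PredictionFilter,
// PredictionColsSelector, SelfPairBuilder, and the Matcher methods shown above
SelfPairBuilder iPairBuilder = new SelfPairBuilder(getDSUtil(), args);
ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData, iPairBuilder);
ZFrame dupesActual = predictOnBlocks(blocks);

// keep only rows where ColName.PREDICTION_COL equals ColValues.IS_MATCH_PREDICTION
IFilter predictionFilter = new PredictionFilter();
ZFrame filteredData = predictionFilter.filter(dupesActual);

// optional: trim to the id, prediction and score columns named in PredictionColsSelector
PredictionColsSelector colsSelector = new PredictionColsSelector();
filteredData = filteredData.select(colsSelector.getCols());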
diff --git a/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java index 3740ff17a..8affb1f76 100644 --- a/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java +++ b/common/core/src/main/java/zingg/common/core/filter/PredictionFilter.java @@ -4,7 +4,6 @@ import org.apache.commons.logging.LogFactory; import zingg.common.client.ZFrame; -import zingg.common.client.cols.PredictionColsSelector; import zingg.common.client.util.ColName; import zingg.common.client.util.ColValues; @@ -12,28 +11,13 @@ public class PredictionFilter implements IFilter { public static final Log LOG = LogFactory.getLog(PredictionFilter.class); - protected PredictionColsSelector colsSelector; - public PredictionFilter() { super(); } - public PredictionFilter(PredictionColsSelector colsSelector) { - super(); - this.colsSelector = colsSelector; - } - @Override public ZFrame filter(ZFrame dupes) { dupes = filterMatches(dupes); - dupes = selectCols(dupes); - return dupes; - } - - protected ZFrame selectCols(ZFrame dupes) { - if(colsSelector!=null) { - dupes = dupes.select(colsSelector.getCols()); - } return dupes; } From 3dec5784953de8ff6595904dafac2b573a663d48 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 15 Mar 2024 09:27:19 +0530 Subject: [PATCH 111/219] clean up extra method --- .../src/main/java/zingg/common/core/executor/Linker.java | 2 +- .../src/main/java/zingg/common/core/executor/Matcher.java | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index 2651af982..d63526410 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -36,7 +36,7 @@ public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Ex protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ PredictionFilter predictionFilter = new PredictionFilter(); SelfPairBuilderSourceSensitive iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); - return getActualDupes(blocked, testData,predictionFilter, iPairBuilder); + return getActualDupes(blocked, testData,predictionFilter, iPairBuilder, null); } @Override diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 5478045fb..483059c4d 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -85,11 +85,6 @@ protected ZFrame getActualDupes(ZFrame blocked, ZFrame test return getActualDupes(blocked, testData,predictionFilter, iPairBuilder,new PredictionColsSelector()); } - protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData, - IFilter predictionFilter, IPairBuilder iPairBuilder) throws Exception, ZinggClientException{ - return getActualDupes(blocked,testData,predictionFilter,iPairBuilder,null); - } - protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData, IFilter predictionFilter, IPairBuilder iPairBuilder, PredictionColsSelector colsSelector) throws Exception, ZinggClientException{ ZFrame blocks = getPairs(selectColsFromBlocked(blocked), testData, iPairBuilder); From d2730de84cb5b8af5c124ce7f21d7857852edbda Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 15 Mar 2024 11:00:04 
+0530 Subject: [PATCH 112/219] moved init listener events to init method --- common/client/src/main/java/zingg/common/client/Client.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 43f27d753..05fb60d64 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -214,8 +214,6 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { client = getClient(arguments, options); client.init(); // after setting arguments etc. as some of the listeners need it - initializeListeners(); - EventsListener.getInstance().fireEvent(new ZinggStartEvent()); client.execute(); client.postMetrics(); LOG.warn("Zingg processing has completed"); @@ -263,7 +261,8 @@ else if (options.get(ClientOptions.CONF).value.endsWith("env")) { public void init() throws ZinggClientException { zingg.init(getArguments(), getSession()); if (session != null) zingg.setSession(session); - + initializeListeners(); + EventsListener.getInstance().fireEvent(new ZinggStartEvent()); } /** From 867de3cd3939304d7d22ca0de93eba02b47fd605 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 20 Mar 2024 10:56:26 +0530 Subject: [PATCH 113/219] Update error msg in StopWordsRecommender --- .../zingg/common/core/recommender/StopWordsRecommender.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java index 8d63519fc..2dacbbc48 100644 --- a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java +++ b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java @@ -51,7 +51,7 @@ public void createStopWordsDocuments(ZFrame data, String fieldName) throw data = findStopWords(data, fieldName); context.getPipeUtil().write(data, context.getPipeUtil().getStopWordsPipe(args, filenameCSV)); } else { - LOG.info("An invalid column name - " + args.getColumn() + " entered. Please provide valid column name."); + LOG.info("An invalid column name - " + args.getColumn() + " entered. Please provide valid column name, as per the config (they are case sensitive)"); } } else { LOG.info("Please provide '--column ' option at command line to generate stop words for that column."); From 9520ed25ca569ec831b4d3401e1d833eb81b36c6 Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 20 Mar 2024 10:58:56 +0530 Subject: [PATCH 114/219] Update error msg in StopWordsRecommender --- .../zingg/common/core/recommender/StopWordsRecommender.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java index 2dacbbc48..b09ac7556 100644 --- a/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java +++ b/common/core/src/main/java/zingg/common/core/recommender/StopWordsRecommender.java @@ -51,7 +51,7 @@ public void createStopWordsDocuments(ZFrame data, String fieldName) throw data = findStopWords(data, fieldName); context.getPipeUtil().write(data, context.getPipeUtil().getStopWordsPipe(args, filenameCSV)); } else { - LOG.info("An invalid column name - " + args.getColumn() + " entered. 
Please provide valid column name, as per the config (they are case sensitive)"); + LOG.info("An invalid column name - " + args.getColumn() + " entered. Please provide valid column name, as per the config (they are case sensitive)."); } } else { LOG.info("Please provide '--column ' option at command line to generate stop words for that column."); From a3447c59b23ea95e08df7769c1aca3e38e8cf3da Mon Sep 17 00:00:00 2001 From: gnanaprakash-ravi Date: Wed, 20 Mar 2024 22:40:55 +0530 Subject: [PATCH 115/219] Remove unnecessary msg in listener --- .../zingg/common/client/event/listeners/ZinggStartListener.java | 1 - .../zingg/common/client/event/listeners/ZinggStopListener.java | 1 - 2 files changed, 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java index 5cf9fc4cd..06ed396c7 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStartListener.java @@ -6,7 +6,6 @@ public class ZinggStartListener extends IEventListener { @Override public void listen(IEvent event) { - System.out.println("ZinggStartListener: I am listening"); } } diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java index e5611fbd1..9d161dfb9 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/ZinggStopListener.java @@ -6,6 +6,5 @@ public class ZinggStopListener extends IEventListener { @Override public void listen(IEvent event) { - System.out.println("ZinggStopListener: I am listening"); } } From b8041c9014125e1365305b55cee1eb72e594f14c Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Thu, 4 Apr 2024 20:59:08 +0200 Subject: [PATCH 116/219] Initial work on the spark-connect compatible flow On branch main Your branch is up to date with 'origin/main'. Changes to be committed: modified: python/requirements.txt new file: python/zingg_v2/__init__.py new file: python/zingg_v2/client.py new file: python/zingg_v2/errors.py new file: python/zingg_v2/structs.py --- python/requirements.txt | 2 +- python/zingg_v2/__init__.py | 0 python/zingg_v2/client.py | 155 ++++++++++++++++++++++++++++++++++++ python/zingg_v2/errors.py | 2 + python/zingg_v2/structs.py | 83 +++++++++++++++++++ 5 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 python/zingg_v2/__init__.py create mode 100644 python/zingg_v2/client.py create mode 100644 python/zingg_v2/errors.py create mode 100644 python/zingg_v2/structs.py diff --git a/python/requirements.txt b/python/requirements.txt index 906e14e38..0d786b720 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,4 +3,4 @@ seaborn matplotlib sphinx sphinx-rtd-theme -pyspark +pyspark>=3.5 diff --git a/python/zingg_v2/__init__.py b/python/zingg_v2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py new file mode 100644 index 000000000..b2ef21dcc --- /dev/null +++ b/python/zingg_v2/client.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from collections.abc import Sequence +from typing import Optional, Union + +from pyspark.sql import SparkSession + +from . 
import structs +from .errors import JobBuilderNotInitialized + + +class ZinggJobBuilder: + def __init__(self) -> None: + self._fields: list[structs.ZinggField] = [] + self._data: list[structs.ZinggData] = [] + self._output: list[structs.ZinggData] = [] + self._label_data_sample_size: Optional[float] = None + self._num_partitions: Optional[int] = None + self._model_id: Optional[int] = None + self._zingg_dir: Optional[str] = None + self._job_type: structs.ZinggJobType = structs.ZinggJobType.SPARK + self._job_params: Optional[Union[structs.ZinggBigQueryParams, structs.ZinggSnowFlakeParams]] = None + self._job_definition: Optional[structs.ZinggJobDefinition] = None + + def add_field(self, field: structs.ZinggField) -> "ZinggJobBuilder": + self._fields.append(field) + return self + + def set_fields(self, fields: Sequence[structs.ZinggField]) -> "ZinggJobBuilder": + self._fields = [f for f in fields] + return self + + def get_fields(self) -> tuple[structs.ZinggField, ...]: + return tuple(self._fields) + + def add_data(self, data: structs.ZinggData) -> "ZinggJobBuilder": + self._data.append(data) + return self + + def set_data(self, data: Sequence[structs.ZinggData]) -> "ZinggJobBuilder": + self._data = [d for d in data] + return self + + def get_data(self) -> tuple[structs.ZinggData, ...]: + return tuple(self._data) + + def set_label_data_sample_size(self, label_sample_size: float) -> "ZinggJobBuilder": + self._label_data_sample_size = label_sample_size + return self + + def get_label_data_sample_size(self) -> float: + if self._label_data_sample_size is None: + raise JobBuilderNotInitialized("Label data sample size is not set") + else: + return self._label_data_sample_size + + def set_num_partitions(self, num_partitions: int) -> "ZinggJobBuilder": + self._num_partitions = num_partitions + return self + + def get_num_partitions(self) -> int: + if self._num_partitions is None: + raise JobBuilderNotInitialized("Num partitions is not set") + else: + return self._num_partitions + + def set_model_id(self, model_id: int) -> "ZinggJobBuilder": + self._model_id = model_id + return self + + def get_model_id(self) -> int: + if self._model_id is None: + raise JobBuilderNotInitialized("Model ID is not set") + else: + return self._model_id + + def set_zingg_dir(self, zingg_dir: str) -> "ZinggJobBuilder": + self._zingg_dir = zingg_dir + return self + + def get_zingg_dir(self) -> str: + if self._zingg_dir is None: + raise JobBuilderNotInitialized("Zingg Directory is not set") + else: + return self._zingg_dir + + def set_job_type(self, job_type: structs.ZinggJobType) -> "ZinggJobBuilder": + self._job_type = job_type + return self + + def get_job_type(self) -> structs.ZinggJobType: + return self._job_type + + def set_params(self, params: Union[structs.ZinggSnowFlakeParams, structs.ZinggBigQueryParams]) -> "ZinggJobBuilder": + if isinstance(params, structs.ZinggSnowFlakeParams) and (self._job_type != structs.ZinggJobType.SNOWFLAKE): + print(f"Warning! You are trying to add Snowflake parameters, but the current type of the job is {self._job_type}!") + + if isinstance(params, structs.ZinggBigQueryParams) and (self._job_type != structs.ZinggJobType.BIG_QUERY): + print(f"Warning! 
You are trying to add BigQuery parameters, but the current type of the job is {self._job_type}!") + + self._job_params = params + return self + + def get_params(self) -> Optional[Union[structs.ZinggSnowFlakeParams, structs.ZinggBigQueryParams]]: + return self._job_params + + def build_job_definition(self) -> structs.ZinggJobDefinition: + if not self._is_initialized(): + err_msg = "Job is not properly initialized." + err_msg += "\n\tCheck that zingg dir, model_id, num_partitions and label sample size are set" + err_msg += "\n\tIf job type is BigQuery or Snowflake, check that corresponding params are set" + raise JobBuilderNotInitialized(err_msg) + + return structs.ZinggJobDefinition( + job_type=self._job_type, + fields_definition=self._fields, + output=self._output, + data=self._data, + label_sample_size=self._label_data_sample_size, + num_partitions=self._num_partitions, + model_id=self._model_id, + zingg_dir=self._zingg_dir, + job_params=self._job_params, + ) + + def _is_initialized(self) -> bool: + res = True + res &= self._label_data_sample_size is not None + res &= self._num_partitions is not None + res &= self._model_id is not None + res &= self._zingg_dir is not None + res &= len(self._data) >= 1 + res &= len(self._output) >= 1 + res &= len(self._fields) >= 1 + + if self._job_type == structs.ZinggJobType.BIG_QUERY: + res &= self._job_params is not None + res &= isinstance(self._job_params, structs.ZinggBigQueryParams) + + if self._job_type == structs.ZinggJobType.SNOWFLAKE: + res &= self._job_params is not None + res &= isinstance(self._job_params, structs.ZinggSnowFlakeParams) + + return res + + +def run_zingg_job(job_definition: structs.ZinggJobDefinition, spark: SparkSession) -> None: + is_spark_connect = hasattr(spark, "_jvm") + + if not is_spark_connect: + raise NotImplementedError() + # TODO: implemnt spark classic pipe generation from JobDefinition + else: + raise NotImplementedError() + # TODO: call Zingg on a side of SparkConnect Server by passing parameters from PySpark via command diff --git a/python/zingg_v2/errors.py b/python/zingg_v2/errors.py new file mode 100644 index 000000000..7e1b60793 --- /dev/null +++ b/python/zingg_v2/errors.py @@ -0,0 +1,2 @@ +class JobBuilderNotInitialized(ValueError): + pass diff --git a/python/zingg_v2/structs.py b/python/zingg_v2/structs.py new file mode 100644 index 000000000..a53eeb4fb --- /dev/null +++ b/python/zingg_v2/structs.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum, auto +from typing import Optional, Union + +from pyspark.sql.types import DataType, StructType + + +class ZinggMatchType(StrEnum): + FUZZY = auto() + EXACT = auto() + DONT_USE = auto() + EMAIL = auto() + PINCODE = auto() + NULL_OR_BLANK = auto() + TEXT = auto() + NUMERIC = auto() + NUMERIC_WITH_UNITS = auto() + ONLY_ALPHABETS_EXACT = auto() + ONLY_ALPHABETS_FUZZY = auto() + + +class ZinggFormatType(StrEnum): + CSV = auto() + PARQUET = auto() + ORC = auto() + JDBC = auto() + AVRO = auto() + + +class ZinggJobType(StrEnum): + SPARK = auto() + BIG_QUERY = auto() + SNOWFLAKE = auto() + + +@dataclass +class ZinggField: + filed_name: str + fields: list[str] + data_type: DataType + match_type: ZinggMatchType + + +@dataclass +class ZinggData: + name: str + format: ZinggFormatType + props: dict[str, str] + schema: StructType + + +@dataclass +class ZinggBigQueryParams: + views_enabled: bool + credential_file: str + table: str + temp_gcs_bucket: str + + +@dataclass +class ZinggSnowFlakeParams: + url: str + 
user: str + password: str + database: str + schema: str + warehouse: str + dbtable: str + + +@dataclass +class ZinggJobDefinition: + job_type: ZinggJobType + fields_definition: list[ZinggField] + output: list[ZinggData] + data: list[ZinggData] + label_sample_size: float + num_partitions: int + model_id: int + zingg_dir: str + job_params: Optional[Union[ZinggSnowFlakeParams, ZinggBigQueryParams]] = None From 7ab47be64a476b81bd450cc96e5c4a5e15c1c642 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Sat, 6 Apr 2024 16:03:53 +0200 Subject: [PATCH 117/219] Update On branch main Your branch is up to date with 'origin/main'. Changes to be committed: modified: python/zingg_v2/structs.py --- python/zingg_v2/structs.py | 147 +++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 56 deletions(-) diff --git a/python/zingg_v2/structs.py b/python/zingg_v2/structs.py index a53eeb4fb..298947abc 100644 --- a/python/zingg_v2/structs.py +++ b/python/zingg_v2/structs.py @@ -1,10 +1,10 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import asdict, dataclass, fields from enum import StrEnum, auto -from typing import Optional, Union +from typing import Any, Optional, Sequence, Union -from pyspark.sql.types import DataType, StructType +from pandas.core.frame import itertools class ZinggMatchType(StrEnum): @@ -21,63 +21,98 @@ class ZinggMatchType(StrEnum): ONLY_ALPHABETS_FUZZY = auto() -class ZinggFormatType(StrEnum): - CSV = auto() - PARQUET = auto() - ORC = auto() - JDBC = auto() - AVRO = auto() +class FieldDefinition: + def __init__(self, name: str, dataType: str, *matchType: Union[str, ZinggMatchType]) -> None: + self.name = name + self.dataType = dataType + self.match_types = [] + for mt in matchType: + if isinstance(mt, str): + mt = ZinggMatchType(mt) + self.match_types.append(mt) -class ZinggJobType(StrEnum): - SPARK = auto() - BIG_QUERY = auto() - SNOWFLAKE = auto() + self.stopwords: Optional[str] = None + def setStopWords(self, stopWords: str) -> None: + self.stopwords = stopWords -@dataclass -class ZinggField: - filed_name: str - fields: list[str] - data_type: DataType - match_type: ZinggMatchType - - -@dataclass -class ZinggData: - name: str - format: ZinggFormatType - props: dict[str, str] - schema: StructType - - -@dataclass -class ZinggBigQueryParams: - views_enabled: bool - credential_file: str - table: str - temp_gcs_bucket: str - - -@dataclass -class ZinggSnowFlakeParams: - url: str - user: str - password: str - database: str - schema: str - warehouse: str - dbtable: str + def getFieldDefinition(self) -> Any: + # TODO: imeplement it + # A single point where all the interactions with JVM should be + raise NotImplementedError() @dataclass -class ZinggJobDefinition: - job_type: ZinggJobType - fields_definition: list[ZinggField] - output: list[ZinggData] - data: list[ZinggData] - label_sample_size: float - num_partitions: int - model_id: int - zingg_dir: str - job_params: Optional[Union[ZinggSnowFlakeParams, ZinggBigQueryParams]] = None +class ClientOptionsV2: + phase: str = "peekModel" + license: str = "zinggLic.txt" + email: str = "zingg@zingg.ai" + conf: str = "dummyConf.json" + preprocess: Optional[str] = None + jobId: Optional[str] = None + format: Optional[str] = None + zinggDir: Optional[str] = None + modelId: Optional[str] = None + collectMetrics: Optional[str] = None + showConcise: Optional[str] = None + location: Optional[str] = None + column: Optional[str] = None + remote: Optional[str] = None + + def to_java_args(self) -> 
list[str]: + return list(itertools.chain.from_iterable([[f"--{key}", value] for key, value in asdict(self) if value is not None])) + + +class ClientOptions: + def __init__(self, argsSent: Optional[Sequence[str]]) -> None: + if argsSent is None: + args = [] + else: + args = [a for a in argsSent] + + self._opt_v2 = ClientOptionsV2(**{k: v for k, v in zip(args[:-1], args[1:])}) + print("arguments for client options are ", self._opt_v2.to_java_args()) + + def getClientOptions(self): + java_args = self._opt_v2.to_java_args() + # TODO: implement it by passing options ot JVM + # A single point where all the interactions with JVM should be + raise NotImplementedError() + + def getOptionValue(self, option: str) -> str: + if option.startswith("--"): + option = option[2:] + + if not hasattr(self._opt_v2, option): + _msg = "Wrong option; possible options are: " + _msg += ", ".join(f.name for f in fields(self._opt_v2)) + raise KeyError(_msg) + + return getattr(self._opt_v2, option) + + def setOptionValue(self, option: str, value: str) -> None: + if option.startswith("--"): + option = option[2:] + + if not hasattr(self._opt_v2, option): + _msg = "Wrong option; possible options are: " + _msg += ", ".join(f.name for f in fields(self._opt_v2)) + raise KeyError(_msg) + + setattr(self._opt_v2, option, value) + + def getPhase(self) -> str: + return self._opt_v2.phase + + def setPhase(self, newValue: str) -> None: + self._opt_v2.phase = newValue + + def getConf(self) -> str: + return self._opt_v2.conf + + def hasLocation(self) -> bool: + return self._opt_v2.location is None + + def getLocation(self) -> Optional[str]: + return self._opt_v2.location From 25f6ccd7021b6b86a0c66b70b9b332f3961603d5 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Mon, 8 Apr 2024 21:12:55 +0200 Subject: [PATCH 118/219] Update the new implementation On branch main Your branch is up to date with 'origin/main'. Changes to be committed: deleted: python/zingg_v2/client.py new file: python/zingg_v2/pipes.py modified: python/zingg_v2/structs.py --- python/zingg_v2/client.py | 155 ------------------------------------- python/zingg_v2/pipes.py | 124 +++++++++++++++++++++++++++++ python/zingg_v2/structs.py | 18 ++++- 3 files changed, 141 insertions(+), 156 deletions(-) delete mode 100644 python/zingg_v2/client.py create mode 100644 python/zingg_v2/pipes.py diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py deleted file mode 100644 index b2ef21dcc..000000000 --- a/python/zingg_v2/client.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -from collections.abc import Sequence -from typing import Optional, Union - -from pyspark.sql import SparkSession - -from . 
import structs -from .errors import JobBuilderNotInitialized - - -class ZinggJobBuilder: - def __init__(self) -> None: - self._fields: list[structs.ZinggField] = [] - self._data: list[structs.ZinggData] = [] - self._output: list[structs.ZinggData] = [] - self._label_data_sample_size: Optional[float] = None - self._num_partitions: Optional[int] = None - self._model_id: Optional[int] = None - self._zingg_dir: Optional[str] = None - self._job_type: structs.ZinggJobType = structs.ZinggJobType.SPARK - self._job_params: Optional[Union[structs.ZinggBigQueryParams, structs.ZinggSnowFlakeParams]] = None - self._job_definition: Optional[structs.ZinggJobDefinition] = None - - def add_field(self, field: structs.ZinggField) -> "ZinggJobBuilder": - self._fields.append(field) - return self - - def set_fields(self, fields: Sequence[structs.ZinggField]) -> "ZinggJobBuilder": - self._fields = [f for f in fields] - return self - - def get_fields(self) -> tuple[structs.ZinggField, ...]: - return tuple(self._fields) - - def add_data(self, data: structs.ZinggData) -> "ZinggJobBuilder": - self._data.append(data) - return self - - def set_data(self, data: Sequence[structs.ZinggData]) -> "ZinggJobBuilder": - self._data = [d for d in data] - return self - - def get_data(self) -> tuple[structs.ZinggData, ...]: - return tuple(self._data) - - def set_label_data_sample_size(self, label_sample_size: float) -> "ZinggJobBuilder": - self._label_data_sample_size = label_sample_size - return self - - def get_label_data_sample_size(self) -> float: - if self._label_data_sample_size is None: - raise JobBuilderNotInitialized("Label data sample size is not set") - else: - return self._label_data_sample_size - - def set_num_partitions(self, num_partitions: int) -> "ZinggJobBuilder": - self._num_partitions = num_partitions - return self - - def get_num_partitions(self) -> int: - if self._num_partitions is None: - raise JobBuilderNotInitialized("Num partitions is not set") - else: - return self._num_partitions - - def set_model_id(self, model_id: int) -> "ZinggJobBuilder": - self._model_id = model_id - return self - - def get_model_id(self) -> int: - if self._model_id is None: - raise JobBuilderNotInitialized("Model ID is not set") - else: - return self._model_id - - def set_zingg_dir(self, zingg_dir: str) -> "ZinggJobBuilder": - self._zingg_dir = zingg_dir - return self - - def get_zingg_dir(self) -> str: - if self._zingg_dir is None: - raise JobBuilderNotInitialized("Zingg Directory is not set") - else: - return self._zingg_dir - - def set_job_type(self, job_type: structs.ZinggJobType) -> "ZinggJobBuilder": - self._job_type = job_type - return self - - def get_job_type(self) -> structs.ZinggJobType: - return self._job_type - - def set_params(self, params: Union[structs.ZinggSnowFlakeParams, structs.ZinggBigQueryParams]) -> "ZinggJobBuilder": - if isinstance(params, structs.ZinggSnowFlakeParams) and (self._job_type != structs.ZinggJobType.SNOWFLAKE): - print(f"Warning! You are trying to add Snowflake parameters, but the current type of the job is {self._job_type}!") - - if isinstance(params, structs.ZinggBigQueryParams) and (self._job_type != structs.ZinggJobType.BIG_QUERY): - print(f"Warning! 
You are trying to add BigQuery parameters, but the current type of the job is {self._job_type}!") - - self._job_params = params - return self - - def get_params(self) -> Optional[Union[structs.ZinggSnowFlakeParams, structs.ZinggBigQueryParams]]: - return self._job_params - - def build_job_definition(self) -> structs.ZinggJobDefinition: - if not self._is_initialized(): - err_msg = "Job is not properly initialized." - err_msg += "\n\tCheck that zingg dir, model_id, num_partitions and label sample size are set" - err_msg += "\n\tIf job type is BigQuery or Snowflake, check that corresponding params are set" - raise JobBuilderNotInitialized(err_msg) - - return structs.ZinggJobDefinition( - job_type=self._job_type, - fields_definition=self._fields, - output=self._output, - data=self._data, - label_sample_size=self._label_data_sample_size, - num_partitions=self._num_partitions, - model_id=self._model_id, - zingg_dir=self._zingg_dir, - job_params=self._job_params, - ) - - def _is_initialized(self) -> bool: - res = True - res &= self._label_data_sample_size is not None - res &= self._num_partitions is not None - res &= self._model_id is not None - res &= self._zingg_dir is not None - res &= len(self._data) >= 1 - res &= len(self._output) >= 1 - res &= len(self._fields) >= 1 - - if self._job_type == structs.ZinggJobType.BIG_QUERY: - res &= self._job_params is not None - res &= isinstance(self._job_params, structs.ZinggBigQueryParams) - - if self._job_type == structs.ZinggJobType.SNOWFLAKE: - res &= self._job_params is not None - res &= isinstance(self._job_params, structs.ZinggSnowFlakeParams) - - return res - - -def run_zingg_job(job_definition: structs.ZinggJobDefinition, spark: SparkSession) -> None: - is_spark_connect = hasattr(spark, "_jvm") - - if not is_spark_connect: - raise NotImplementedError() - # TODO: implemnt spark classic pipe generation from JobDefinition - else: - raise NotImplementedError() - # TODO: call Zingg on a side of SparkConnect Server by passing parameters from PySpark via command diff --git a/python/zingg_v2/pipes.py b/python/zingg_v2/pipes.py new file mode 100644 index 000000000..59e3213ef --- /dev/null +++ b/python/zingg_v2/pipes.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import json +import warnings +from typing import Optional, Union + +from pandas import DataFrame as PDataFrame +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType + +from zingg_v2.structs import ZinggFileFormat + + +class Pipe: + def __init__(self, name: str, format: Union[str, ZinggFileFormat]) -> None: + self.name = name + if not isinstance(format, ZinggFileFormat): + format = ZinggFileFormat(format) + self.format = format + self.properties: dict[str, str] = {} + self.schema: Optional[str] = None + + def getPipe(self): + # TODO: implement it + raise NotImplementedError() + + def addProperty(self, name: str, value: str) -> None: + self.properties[name] = value + + def setSchema(self, schema: str) -> None: + self.schema = schema + + def toString(self) -> str: + return json.dumps({"name": self.name, "format": self.format, "schema": self.schema, "properties": json.dumps(self.properties)}) + + +class CsvPipe(Pipe): + def __init__(self, name: str, location: Optional[str] = None, schema: Optional[str] = None) -> None: + super().__init__(name, ZinggFileFormat.CSV) + if schema is not None: + self.setSchema(schema) + if location is not None: + self.addProperty("location", location) + + def setDelimiter(self, delimiter: str) -> None: + 
self.addProperty("delimiter", delimiter) + + def setLocation(self, location: str) -> None: + self.addProperty("location", location) + + def setHeader(self, header: str) -> None: + self.addProperty("header", header) + + +class BigQueryPipe(Pipe): + def __init__(self, name: str) -> None: + super().__init__(name, ZinggFileFormat.BIGQUERY) + + def setCredentialFile(self, credentials_file: str) -> None: + self.addProperty("credentialsFile", credentials_file) + + def setTable(self, table: str) -> None: + self.addProperty("table", table) + + def setTemporaryGcsBucket(self, bucket: str) -> None: + self.addProperty("temporaryGcsBucket", bucket) + + def setViewsEnabled(self, isEnabled: bool) -> None: + self.addProperty("viewsEnabled", "true" if isEnabled else "false") + + +class SnowflakePipe(Pipe): + def __init__(self, name: str) -> None: + super().__init__(name, ZinggFileFormat.SNOWFLAKE) + self.addProperty("application", "zinggai_zingg") + + def setUrl(self, url: str) -> None: + self.addProperty("sfUrl", url) + + def setUser(self, user: str) -> None: + self.addProperty("sfUser", user) + + def setPassword(self, passwd: str) -> None: + self.addProperty("sfPassword", passwd) + + def setDatabase(self, db: str) -> None: + self.addProperty("sfDatabase", db) + + def setSFSchema(self, schema: str) -> None: + self.addProperty("sfSchema", schema) + + def setWarehouse(self, warehouse: str) -> None: + self.addProperty("sfWarehouse", warehouse) + + def setDbTable(self, dbtable: str) -> None: + self.addProperty("dbtable", dbtable) + + +class InMemoryPipe(Pipe): + def __init__(self, name: str, df: Optional[Union[DataFrame, PDataFrame]] = None) -> None: + super().__init__(name, ZinggFileFormat.INMEMORY) + self.df: Optional[DataFrame] = None + if df is not None: + self.setDataset(df) + + def setDataset(self, df: Union[DataFrame, PDataFrame]) -> None: + if isinstance(df, PDataFrame): + spark = SparkSession.getActiveSession() + if spark is None: + warnings.warn("No active Session Found!") + spark = SparkSession.builder.getOrCreate() + + if self.schema is None: + df = spark.createDataFrame(df) + else: + df = spark.createDataFrame(df, schema=StructType.fromJson(json.loads(self.schema))) + + self.df = df + + def getDataset(self) -> DataFrame: + if self.df is None: + raise ValueError("DataFrame is not set!") + + return self.df diff --git a/python/zingg_v2/structs.py b/python/zingg_v2/structs.py index 298947abc..26be7cf5e 100644 --- a/python/zingg_v2/structs.py +++ b/python/zingg_v2/structs.py @@ -21,13 +21,29 @@ class ZinggMatchType(StrEnum): ONLY_ALPHABETS_FUZZY = auto() +class ZinggFileFormat(StrEnum): + CSV = auto() + PARQUET = auto() + JSON = auto() + TEXT = auto() + XLS = "com.crealytics.spark.excel" + AVRO = auto() + JDBC = auto() + CASSANDRA = "org.apache.spark.sql.cassandra" + SNOWFLAKE = "net.snowflake.spark.snowflake" + ELASTIC = "org.elasticsearch.spark.sql" + EXACOL = "com.exasol.spark" + BIGQUERY = auto() + INMEMORY = auto() + + class FieldDefinition: def __init__(self, name: str, dataType: str, *matchType: Union[str, ZinggMatchType]) -> None: self.name = name self.dataType = dataType self.match_types = [] for mt in matchType: - if isinstance(mt, str): + if not isinstance(mt, ZinggMatchType): mt = ZinggMatchType(mt) self.match_types.append(mt) From d0c0aed564f56cd821348f699497de833f28bd1a Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Tue, 9 Apr 2024 17:53:58 +0200 Subject: [PATCH 119/219] Update the implementation DRAFT On branch main Your branch is up to date with 'origin/main'. 
Changes to be committed: modified: python/pyproject.toml new file: python/zingg_v2/client.py modified: python/zingg_v2/errors.py new file: python/zingg_v2/models.py modified: python/zingg_v2/pipes.py deleted: python/zingg_v2/structs.py --- python/pyproject.toml | 19 +++ python/zingg_v2/client.py | 257 +++++++++++++++++++++++++++++++++++++ python/zingg_v2/errors.py | 5 +- python/zingg_v2/models.py | 133 +++++++++++++++++++ python/zingg_v2/pipes.py | 35 +++-- python/zingg_v2/structs.py | 134 ------------------- 6 files changed, 430 insertions(+), 153 deletions(-) create mode 100644 python/zingg_v2/client.py create mode 100644 python/zingg_v2/models.py delete mode 100644 python/zingg_v2/structs.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 16637529c..40a91838f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,2 +1,21 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "zingg" +dynamic = ["version"] +dependencies = [ + "pandas", + "seaborn", + "matplotlib", + "sphinx", + "sphinx-rtd-theme", + "pyspark>=3.5", + "pydantic", +] +readme = "README.md" +requires-python = ">=3.11" + [tool.ruff] line-length = 150 diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py new file mode 100644 index 000000000..863bcf236 --- /dev/null +++ b/python/zingg_v2/client.py @@ -0,0 +1,257 @@ +from __future__ import annotations + +import json +import os +from dataclasses import fields +from typing import Optional, Sequence, Union + +from zingg_v2 import models as models_v2 +from zingg_v2.errors import ZinggArgumentsValidationError +from zingg_v2.pipes import Pipe + + +class Zingg: + def __init__(self, args: Arguments, options: ClientOptions) -> None: + self.args = args + self.options = options + + def execute(self) -> None: + # TODO: implement it + # java_args: arguments in form of string + # that is pairs of --key value + java_args = self.options.getClientOptions() + + # java_arguments is JSON definition of Zingg Job + java_arguments = self.args.writeArgumentsToJSONString() + + raise NotImplementedError() + + def executeLabel(self) -> None: + raise NotImplementedError() + + def executeLabelUpdate(self) -> None: + raise NotImplementedError() + + def getMarkedRecords(self) -> None: + raise NotImplementedError() + + def getUnmarkedRecords(self) -> None: + raise NotImplementedError() + + def processRecordsCli(self, unmarkedRecords, args): + raise NotImplementedError() + + def processRecordsCliLabelUpdate(self, lines, args): + raise NotImplementedError() + + def writeLabelledOutput(self, updatedRecords, args): + raise NotImplementedError() + + def writeLabelledOutputFromPandas(self, candidate_pairs_pd, args): + raise NotImplementedError() + + def setArguments(self, args: Arguments) -> None: + self.args = args + + def getArguments(self) -> Arguments: + return self.args + + def getOptions(self) -> ClientOptions: + return self.options + + def setOptions(self, options: ClientOptions) -> None: + self.options = options + + def getMarkedRecordsStat(self, markedRecords, value): + raise NotImplementedError() + + def getMatchedMarkedRecordsStat(self): + raise NotImplementedError() + + def getUnmatchedMarkedRecordsStat(self): + raise NotImplementedError() + + def getUnsureMarkedRecordsStat(self): + raise NotImplementedError() + + +class FieldDefinition: + def __init__(self, name: str, dataType: str, *matchType: Union[str, models_v2.MatchType]) -> None: + match_types = [] + for mt in matchType: + if not isinstance(mt, 
models_v2.MatchType): + mt = models_v2.MatchType(mt) + + self._model_v2 = models_v2.FieldDefinition(fieldName=name, fields=name, dataType=dataType, matchType=match_types) + + def setStopWords(self, stopWords: str) -> None: + self._model_v2.stopWords = stopWords + + def getFieldDefinition(self) -> str: + return self._model_v2.model_dump_json() + + def to_v2(self) -> models_v2.FieldDefinition: + return self._model_v2 + + +class ClientOptions: + def __init__(self, argsSent: Optional[Sequence[str]]) -> None: + if argsSent is None: + args = [] + else: + args = [a for a in argsSent] + + self._opt_v2 = models_v2.ClientOptions(**{k: v for k, v in zip(args[:-1], args[1:])}) + print("arguments for client options are ", self._opt_v2.to_java_args()) + + def getClientOptions(self) -> str: + return " ".join(self._opt_v2.to_java_args()) + + def getOptionValue(self, option: str) -> str: + if option.startswith("--"): + option = option[2:] + + if not hasattr(self._opt_v2, option): + _msg = "Wrong option; possible options are: " + _msg += ", ".join(f.name for f in fields(self._opt_v2)) + raise KeyError(_msg) + + return getattr(self._opt_v2, option) + + def setOptionValue(self, option: str, value: str) -> None: + if option.startswith("--"): + option = option[2:] + + if not hasattr(self._opt_v2, option): + _msg = "Wrong option; possible options are: " + _msg += ", ".join(f.name for f in fields(self._opt_v2)) + raise KeyError(_msg) + + setattr(self._opt_v2, option, value) + + def getPhase(self) -> str: + return self._opt_v2.phase + + def setPhase(self, newValue: str) -> None: + self._opt_v2.phase = newValue + + def getConf(self) -> str: + return self._opt_v2.conf + + def hasLocation(self) -> bool: + return self._opt_v2.location is None + + def getLocation(self) -> Optional[str]: + return self._opt_v2.location + + def to_v2(self) -> models_v2.ClientOptions: + return self._opt_v2 + + +class Arguments: + def __init__(self): + self._args_v2 = models_v2.Arguments() + + @staticmethod + def _from_v2(arguments_v2: models_v2.Arguments) -> "Arguments": + new_obj = Arguments() + new_obj._args_v2 = arguments_v2 + return new_obj + + def setFieldDefinition(self, fieldDef: list[FieldDefinition]) -> None: + self._args_v2.fieldDefinition = [fd.to_v2() for fd in fieldDef] + + def setData(self, *pipes: Pipe) -> None: + self._args_v2.data = [pp.to_v2() for pp in pipes] + + def setOutput(self, *pipes: Pipe) -> None: + self._args_v2.output = [pp.to_v2() for pp in pipes] + + def getZinggBaseModelDir(self) -> str: + if isinstance(self._args_v2.modelId, int): + model_id = str(self._args_v2.modelId) + else: + model_id = self._args_v2.modelId + + return os.path.join( + self._args_v2.zinggDir, + model_id, + ) + + def getZinggModelDir(self) -> str: + return os.path.join(self.getZinggBaseModelDir(), "model") + + def getZinggBaseTrainingDataDir(self): + return os.path.join( + self.getZinggBaseModelDir(), + "trainingData", + ) + + def getZinggTrainingDataUnmarkedDir(self) -> str: + return os.path.join( + self.getZinggBaseTrainingDataDir(), + "unmarked", + ) + + def getZinggTrainingDataMarkedDir(self) -> str: + return os.path.join( + self.getZinggBaseTrainingDataDir(), + "marked", + ) + + def setTrainingSamples(self, *pipes: Pipe) -> None: + self._args_v2.trainingSamples = [pp.to_v2() for pp in pipes] + + def setModelId(self, id: str) -> None: + self._args_v2.modelId = id + + def getModelId(self): + return self._args_v2.modelId + + def setZinggDir(self, f: str) -> None: + self._args_v2.zinggDir = f + + def setNumPartitions(self, numPartitions: 
int) -> None: + self._args_v2.numPartitions = numPartitions + + def setLabelDataSampleSize(self, labelDataSampleSize: float) -> None: + self._args_v2.labelDataSampleSize = labelDataSampleSize + + def writeArgumentsToJSON(self, fileName: str) -> None: + with open(fileName, "w") as f_: + json.dump( + self._args_v2.model_dump_json(), + f_, + ) + + def setStopWordsCutoff(self, stopWordsCutoff: float) -> None: + self._args_v2.stopWordsCutoff = stopWordsCutoff + + def setColumn(self, column: str): + self._args_v2.column = column + + @staticmethod + def createArgumentsFromJSON(fileName: str, phase: str) -> "Arguments": + with open(fileName, "r") as f_: + json_string = json.load(f_) + + return Arguments.createArgumentsFromJSONString(json_string, phase) + + def writeArgumentsToJSONString(self) -> str: + return self._args_v2.model_dump_json() + + @staticmethod + def createArgumentsFromJSONString(jsonArgs: str, phase: str): + args_v2 = models_v2.Arguments.model_validate(json.loads(jsonArgs)) + + if not args_v2.validate(phase): + raise ZinggArgumentsValidationError("Wrong args for the given phase") + + return Arguments._from_v2(args_v2) + + def copyArgs(self, phase): + argsString = self.writeArgumentsToJSONString() + return self.createArgumentsFromJSONString(argsString, phase) + + def to_v2(self) -> models_v2.Arguments: + return self._args_v2 diff --git a/python/zingg_v2/errors.py b/python/zingg_v2/errors.py index 7e1b60793..dc0658031 100644 --- a/python/zingg_v2/errors.py +++ b/python/zingg_v2/errors.py @@ -1,2 +1,5 @@ -class JobBuilderNotInitialized(ValueError): +class ZinggArgumentsValidationError(ValueError): + pass + +class ZinggParameterIsNotSet(ValueError): pass diff --git a/python/zingg_v2/models.py b/python/zingg_v2/models.py new file mode 100644 index 000000000..15b0dab9c --- /dev/null +++ b/python/zingg_v2/models.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import itertools +from dataclasses import asdict, dataclass +from enum import StrEnum, auto +from typing import Any, Optional, Union + +from pydantic import BaseModel + + +class MatchType(StrEnum): + FUZZY = auto() + EXACT = auto() + DONT_USE = auto() + EMAIL = auto() + PINCODE = auto() + NULL_OR_BLANK = auto() + TEXT = auto() + NUMERIC = auto() + NUMERIC_WITH_UNITS = auto() + ONLY_ALPHABETS_EXACT = auto() + ONLY_ALPHABETS_FUZZY = auto() + + +class FileFormat(StrEnum): + CSV = auto() + PARQUET = auto() + JSON = auto() + TEXT = auto() + XLS = "com.crealytics.spark.excel" + AVRO = auto() + JDBC = auto() + CASSANDRA = "org.apache.spark.sql.cassandra" + SNOWFLAKE = "net.snowflake.spark.snowflake" + ELASTIC = "org.elasticsearch.spark.sql" + EXACOL = "com.exasol.spark" + BIGQUERY = auto() + INMEMORY = auto() + + +class FieldPreprocessor(StrEnum): + NONE = auto() + CHINESE_COMPANY_STOP_WORD_REMOVER = auto() + CHINESE_NAME_STOP_WORD_REMOVER = auto() + MOBILE_STOP_WORD_REMOVER = auto() + CAMERA_STOP_WORD_REMOVER = auto() + THAI_COLOR_STOP_WORD_REMOVER = auto() + PUNCTUATION_CHARS_STOP_WORD_REMOVER = auto() + ADDRESS_STOP_WORD_REMOVER = auto() + PERFUME_STOP_WORD_REMOVER = auto() + FRENCH_COMPANY_STOP_WORD_REMOVER = auto() + ENGLISH_COMPANY_STOP_WORD_REMOVER = auto() + AUTO_STOP_WORD_REMOVER = auto() + POWER_TOOLS_STOP_WORD_REMOVER = auto() + DOMAIN_EXTRACTOR = auto() + JAPANESE_COMPANY_STOP_WORD_REMOVER = auto() + JAPANESE_NAME_STOP_WORD_REMOVER = auto() + + +class FieldDefinition(BaseModel): + matchType: Union[MatchType, list[MatchType]] + dataType: str + fieldName: str + fields: str + stopWords: Optional[str] = None + 
abbreviations: Optional[str] = None + + +class Pipe(BaseModel): + name: str + format: FileFormat + preprocessors: Optional[FieldPreprocessor] = None + props: dict[str, Any] = {} + schema: Optional[str] = None + mode: Optional[str] = None + + +class Arguments(BaseModel): + output: Optional[list[Pipe]] = None + data: Optional[list[Pipe]] = None + zinggDir: str = "/tmp/zingg" + trainingSamples: Optional[list[Pipe]] = None + fieldDefinition: Optional[list[FieldDefinition]] = None + numPartitions: int = 10 + labelDataSampleSize: float = 0.01 + modelId: Union[str, int] = "1" + threshold: float = 0.5 + jobId: int = 1 + collectMetrics: bool = True + showConcise: bool = False + stopWordsCutoff: float = 0.1 + blockSize: int = 100 + column: Optional[str] = None + + def validate(self, phase: str) -> bool: + is_valid = True + if phase in ["train", "match", "trainMatch", "link"]: + is_valid &= self.trainingSamples is not None + is_valid &= self.data is not None + is_valid &= self.numPartitions is not None + is_valid &= self.fieldDefinition is not None + + elif phase in ["seed", "seedDB"]: + is_valid &= self.data is not None + is_valid &= self.numPartitions is not None + is_valid &= self.fieldDefinition is not None + + elif phase != "WEB": + is_valid &= self.data is not None + is_valid &= self.numPartitions is not None + + return is_valid + + +@dataclass +class ClientOptions: + phase: str = "peekModel" + license: str = "zinggLic.txt" + email: str = "zingg@zingg.ai" + conf: str = "dummyConf.json" + preprocess: Optional[str] = None + jobId: Optional[str] = None + format: Optional[str] = None + zinggDir: Optional[str] = None + modelId: Optional[str] = None + collectMetrics: Optional[str] = None + showConcise: Optional[str] = None + location: Optional[str] = None + column: Optional[str] = None + remote: Optional[str] = None + + def to_java_args(self) -> list[str]: + return list(itertools.chain.from_iterable([[f"--{key}", value] for key, value in asdict(self) if value is not None])) diff --git a/python/zingg_v2/pipes.py b/python/zingg_v2/pipes.py index 59e3213ef..f6f2b27db 100644 --- a/python/zingg_v2/pipes.py +++ b/python/zingg_v2/pipes.py @@ -8,35 +8,34 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType -from zingg_v2.structs import ZinggFileFormat +from zingg_v2 import models as models_v2 class Pipe: - def __init__(self, name: str, format: Union[str, ZinggFileFormat]) -> None: - self.name = name - if not isinstance(format, ZinggFileFormat): - format = ZinggFileFormat(format) - self.format = format - self.properties: dict[str, str] = {} - self.schema: Optional[str] = None - - def getPipe(self): - # TODO: implement it - raise NotImplementedError() + def __init__(self, name: str, format: Union[str, models_v2.FileFormat]) -> None: + if not isinstance(format, models_v2.FileFormat): + format = models_v2.FileFormat(format) + self._pipe_v2 = models_v2.Pipe(name=name, format=format) + + def getPipe(self) -> str: + return self.toString() def addProperty(self, name: str, value: str) -> None: - self.properties[name] = value + self._pipe_v2.props[name] = value def setSchema(self, schema: str) -> None: self.schema = schema def toString(self) -> str: - return json.dumps({"name": self.name, "format": self.format, "schema": self.schema, "properties": json.dumps(self.properties)}) + return json.dumps(self._pipe_v2.model_dump_json()) + + def to_v2(self) -> models_v2.Pipe: + return self._pipe_v2 class CsvPipe(Pipe): def __init__(self, name: str, location: Optional[str] = None, schema: 
Optional[str] = None) -> None: - super().__init__(name, ZinggFileFormat.CSV) + super().__init__(name, models_v2.FileFormat.CSV) if schema is not None: self.setSchema(schema) if location is not None: @@ -54,7 +53,7 @@ def setHeader(self, header: str) -> None: class BigQueryPipe(Pipe): def __init__(self, name: str) -> None: - super().__init__(name, ZinggFileFormat.BIGQUERY) + super().__init__(name, models_v2.FileFormat.BIGQUERY) def setCredentialFile(self, credentials_file: str) -> None: self.addProperty("credentialsFile", credentials_file) @@ -71,7 +70,7 @@ def setViewsEnabled(self, isEnabled: bool) -> None: class SnowflakePipe(Pipe): def __init__(self, name: str) -> None: - super().__init__(name, ZinggFileFormat.SNOWFLAKE) + super().__init__(name, models_v2.FileFormat.SNOWFLAKE) self.addProperty("application", "zinggai_zingg") def setUrl(self, url: str) -> None: @@ -98,7 +97,7 @@ def setDbTable(self, dbtable: str) -> None: class InMemoryPipe(Pipe): def __init__(self, name: str, df: Optional[Union[DataFrame, PDataFrame]] = None) -> None: - super().__init__(name, ZinggFileFormat.INMEMORY) + super().__init__(name, models_v2.FileFormat.INMEMORY) self.df: Optional[DataFrame] = None if df is not None: self.setDataset(df) diff --git a/python/zingg_v2/structs.py b/python/zingg_v2/structs.py deleted file mode 100644 index 26be7cf5e..000000000 --- a/python/zingg_v2/structs.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import annotations - -from dataclasses import asdict, dataclass, fields -from enum import StrEnum, auto -from typing import Any, Optional, Sequence, Union - -from pandas.core.frame import itertools - - -class ZinggMatchType(StrEnum): - FUZZY = auto() - EXACT = auto() - DONT_USE = auto() - EMAIL = auto() - PINCODE = auto() - NULL_OR_BLANK = auto() - TEXT = auto() - NUMERIC = auto() - NUMERIC_WITH_UNITS = auto() - ONLY_ALPHABETS_EXACT = auto() - ONLY_ALPHABETS_FUZZY = auto() - - -class ZinggFileFormat(StrEnum): - CSV = auto() - PARQUET = auto() - JSON = auto() - TEXT = auto() - XLS = "com.crealytics.spark.excel" - AVRO = auto() - JDBC = auto() - CASSANDRA = "org.apache.spark.sql.cassandra" - SNOWFLAKE = "net.snowflake.spark.snowflake" - ELASTIC = "org.elasticsearch.spark.sql" - EXACOL = "com.exasol.spark" - BIGQUERY = auto() - INMEMORY = auto() - - -class FieldDefinition: - def __init__(self, name: str, dataType: str, *matchType: Union[str, ZinggMatchType]) -> None: - self.name = name - self.dataType = dataType - self.match_types = [] - for mt in matchType: - if not isinstance(mt, ZinggMatchType): - mt = ZinggMatchType(mt) - - self.match_types.append(mt) - - self.stopwords: Optional[str] = None - - def setStopWords(self, stopWords: str) -> None: - self.stopwords = stopWords - - def getFieldDefinition(self) -> Any: - # TODO: imeplement it - # A single point where all the interactions with JVM should be - raise NotImplementedError() - - -@dataclass -class ClientOptionsV2: - phase: str = "peekModel" - license: str = "zinggLic.txt" - email: str = "zingg@zingg.ai" - conf: str = "dummyConf.json" - preprocess: Optional[str] = None - jobId: Optional[str] = None - format: Optional[str] = None - zinggDir: Optional[str] = None - modelId: Optional[str] = None - collectMetrics: Optional[str] = None - showConcise: Optional[str] = None - location: Optional[str] = None - column: Optional[str] = None - remote: Optional[str] = None - - def to_java_args(self) -> list[str]: - return list(itertools.chain.from_iterable([[f"--{key}", value] for key, value in asdict(self) if value is not None])) - - 
-class ClientOptions: - def __init__(self, argsSent: Optional[Sequence[str]]) -> None: - if argsSent is None: - args = [] - else: - args = [a for a in argsSent] - - self._opt_v2 = ClientOptionsV2(**{k: v for k, v in zip(args[:-1], args[1:])}) - print("arguments for client options are ", self._opt_v2.to_java_args()) - - def getClientOptions(self): - java_args = self._opt_v2.to_java_args() - # TODO: implement it by passing options ot JVM - # A single point where all the interactions with JVM should be - raise NotImplementedError() - - def getOptionValue(self, option: str) -> str: - if option.startswith("--"): - option = option[2:] - - if not hasattr(self._opt_v2, option): - _msg = "Wrong option; possible options are: " - _msg += ", ".join(f.name for f in fields(self._opt_v2)) - raise KeyError(_msg) - - return getattr(self._opt_v2, option) - - def setOptionValue(self, option: str, value: str) -> None: - if option.startswith("--"): - option = option[2:] - - if not hasattr(self._opt_v2, option): - _msg = "Wrong option; possible options are: " - _msg += ", ".join(f.name for f in fields(self._opt_v2)) - raise KeyError(_msg) - - setattr(self._opt_v2, option, value) - - def getPhase(self) -> str: - return self._opt_v2.phase - - def setPhase(self, newValue: str) -> None: - self._opt_v2.phase = newValue - - def getConf(self) -> str: - return self._opt_v2.conf - - def hasLocation(self) -> bool: - return self._opt_v2.location is None - - def getLocation(self) -> Optional[str]: - return self._opt_v2.location From 23df1eec9c222d5b100e4f6df392ca5074a3bf65 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Thu, 11 Apr 2024 19:11:49 +0200 Subject: [PATCH 120/219] Update from comments in PR On branch main Your branch is up to date with 'origin/main'. Changes to be committed: modified: python/zingg_v2/models.py modified: python/zingg_v2/pipes.py --- python/zingg_v2/models.py | 24 ++---------------------- python/zingg_v2/pipes.py | 14 +++++++------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/python/zingg_v2/models.py b/python/zingg_v2/models.py index 15b0dab9c..227da7859 100644 --- a/python/zingg_v2/models.py +++ b/python/zingg_v2/models.py @@ -22,7 +22,7 @@ class MatchType(StrEnum): ONLY_ALPHABETS_FUZZY = auto() -class FileFormat(StrEnum): +class DataFormat(StrEnum): CSV = auto() PARQUET = auto() JSON = auto() @@ -38,25 +38,6 @@ class FileFormat(StrEnum): INMEMORY = auto() -class FieldPreprocessor(StrEnum): - NONE = auto() - CHINESE_COMPANY_STOP_WORD_REMOVER = auto() - CHINESE_NAME_STOP_WORD_REMOVER = auto() - MOBILE_STOP_WORD_REMOVER = auto() - CAMERA_STOP_WORD_REMOVER = auto() - THAI_COLOR_STOP_WORD_REMOVER = auto() - PUNCTUATION_CHARS_STOP_WORD_REMOVER = auto() - ADDRESS_STOP_WORD_REMOVER = auto() - PERFUME_STOP_WORD_REMOVER = auto() - FRENCH_COMPANY_STOP_WORD_REMOVER = auto() - ENGLISH_COMPANY_STOP_WORD_REMOVER = auto() - AUTO_STOP_WORD_REMOVER = auto() - POWER_TOOLS_STOP_WORD_REMOVER = auto() - DOMAIN_EXTRACTOR = auto() - JAPANESE_COMPANY_STOP_WORD_REMOVER = auto() - JAPANESE_NAME_STOP_WORD_REMOVER = auto() - - class FieldDefinition(BaseModel): matchType: Union[MatchType, list[MatchType]] dataType: str @@ -68,8 +49,7 @@ class FieldDefinition(BaseModel): class Pipe(BaseModel): name: str - format: FileFormat - preprocessors: Optional[FieldPreprocessor] = None + format: DataFormat props: dict[str, Any] = {} schema: Optional[str] = None mode: Optional[str] = None diff --git a/python/zingg_v2/pipes.py b/python/zingg_v2/pipes.py index f6f2b27db..60135fe1e 100644 --- 
a/python/zingg_v2/pipes.py +++ b/python/zingg_v2/pipes.py @@ -12,9 +12,9 @@ class Pipe: - def __init__(self, name: str, format: Union[str, models_v2.FileFormat]) -> None: - if not isinstance(format, models_v2.FileFormat): - format = models_v2.FileFormat(format) + def __init__(self, name: str, format: Union[str, models_v2.DataFormat]) -> None: + if not isinstance(format, models_v2.DataFormat): + format = models_v2.DataFormat(format) self._pipe_v2 = models_v2.Pipe(name=name, format=format) def getPipe(self) -> str: @@ -35,7 +35,7 @@ def to_v2(self) -> models_v2.Pipe: class CsvPipe(Pipe): def __init__(self, name: str, location: Optional[str] = None, schema: Optional[str] = None) -> None: - super().__init__(name, models_v2.FileFormat.CSV) + super().__init__(name, models_v2.DataFormat.CSV) if schema is not None: self.setSchema(schema) if location is not None: @@ -53,7 +53,7 @@ def setHeader(self, header: str) -> None: class BigQueryPipe(Pipe): def __init__(self, name: str) -> None: - super().__init__(name, models_v2.FileFormat.BIGQUERY) + super().__init__(name, models_v2.DataFormat.BIGQUERY) def setCredentialFile(self, credentials_file: str) -> None: self.addProperty("credentialsFile", credentials_file) @@ -70,7 +70,7 @@ def setViewsEnabled(self, isEnabled: bool) -> None: class SnowflakePipe(Pipe): def __init__(self, name: str) -> None: - super().__init__(name, models_v2.FileFormat.SNOWFLAKE) + super().__init__(name, models_v2.DataFormat.SNOWFLAKE) self.addProperty("application", "zinggai_zingg") def setUrl(self, url: str) -> None: @@ -97,7 +97,7 @@ def setDbTable(self, dbtable: str) -> None: class InMemoryPipe(Pipe): def __init__(self, name: str, df: Optional[Union[DataFrame, PDataFrame]] = None) -> None: - super().__init__(name, models_v2.FileFormat.INMEMORY) + super().__init__(name, models_v2.DataFormat.INMEMORY) self.df: Optional[DataFrame] = None if df is not None: self.setDataset(df) From 71bf65faab169abbe771f2d940d16f4fe2b07219 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Mon, 15 Apr 2024 22:51:25 +0200 Subject: [PATCH 121/219] Batch of changes On branch main Your branch is up to date with 'origin/main'. 
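The patch below ships the job to the server as a small protobuf message packed into the `extension` field of a Spark Connect `Relation`. A rough client-side sketch of that packing mechanism, assuming the generated `connect_plugins_pb2` module this patch adds:

    from google.protobuf import any_pb2

    from zingg_v2.proto.connect_plugins_pb2 import SubmitZinggJob

    msg = SubmitZinggJob(args="--phase peekModel", options='{"modelId": "1"}')
    packed = any_pb2.Any()
    packed.Pack(msg)  # the server-side plugin later type-checks and unpacks this Any
    assert packed.Is(SubmitZinggJob.DESCRIPTOR)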
Changes to be committed: new file: buf.gen.yaml new file: buf.work.yaml new file: protobuf/connect_plugins.proto modified: python/zingg_v2/client.py new file: python/zingg_v2/connect.py modified: python/zingg_v2/models.py new file: python/zingg_v2/proto/connect_plugins_pb2.py new file: python/zingg_v2/proto/connect_plugins_pb2.pyi new file: python/zingg_v2/proto/connect_plugins_pb2_grpc.py --- buf.gen.yaml | 9 +++++ buf.work.yaml | 3 ++ protobuf/connect_plugins.proto | 6 ++++ python/zingg_v2/client.py | 9 +++-- python/zingg_v2/connect.py | 25 ++++++++++++++ python/zingg_v2/models.py | 2 +- python/zingg_v2/proto/connect_plugins_pb2.py | 26 +++++++++++++++ python/zingg_v2/proto/connect_plugins_pb2.pyi | 33 +++++++++++++++++++ .../proto/connect_plugins_pb2_grpc.py | 4 +++ 9 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 buf.gen.yaml create mode 100644 buf.work.yaml create mode 100644 protobuf/connect_plugins.proto create mode 100644 python/zingg_v2/connect.py create mode 100644 python/zingg_v2/proto/connect_plugins_pb2.py create mode 100644 python/zingg_v2/proto/connect_plugins_pb2.pyi create mode 100644 python/zingg_v2/proto/connect_plugins_pb2_grpc.py diff --git a/buf.gen.yaml b/buf.gen.yaml new file mode 100644 index 000000000..9fb08f4a5 --- /dev/null +++ b/buf.gen.yaml @@ -0,0 +1,9 @@ +version: v1 +plugins: + # Building the Python build and building the mypy interfaces. + - plugin: buf.build/protocolbuffers/python:v25.3 + out: python/zingg_v2/proto + - plugin: buf.build/grpc/python:v1.62.0 + out: python/zingg_v2/proto + - plugin: buf.build/community/nipunn1313-mypy:v3.5.0 + out: python/zingg_v2/proto diff --git a/buf.work.yaml b/buf.work.yaml new file mode 100644 index 000000000..540a3936d --- /dev/null +++ b/buf.work.yaml @@ -0,0 +1,3 @@ +version: v1 +directories: + - protobuf diff --git a/protobuf/connect_plugins.proto b/protobuf/connect_plugins.proto new file mode 100644 index 000000000..ff0c64b6f --- /dev/null +++ b/protobuf/connect_plugins.proto @@ -0,0 +1,6 @@ +syntax = 'proto3'; + +message SubmitZinggJob { + string args = 1; + string options = 2; +} diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py index 863bcf236..29c491b63 100644 --- a/python/zingg_v2/client.py +++ b/python/zingg_v2/client.py @@ -19,6 +19,11 @@ def execute(self) -> None: # TODO: implement it # java_args: arguments in form of string # that is pairs of --key value + + if spark_connect: + .. 
+ else: + spark_classic java_args = self.options.getClientOptions() # java_arguments is JSON definition of Zingg Job @@ -242,9 +247,9 @@ def writeArgumentsToJSONString(self) -> str: @staticmethod def createArgumentsFromJSONString(jsonArgs: str, phase: str): - args_v2 = models_v2.Arguments.model_validate(json.loads(jsonArgs)) + args_v2 = models_v2.Arguments.model_validate_json(jsonArgs) - if not args_v2.validate(phase): + if not args_v2.validate_phase(phase): raise ZinggArgumentsValidationError("Wrong args for the given phase") return Arguments._from_v2(args_v2) diff --git a/python/zingg_v2/connect.py b/python/zingg_v2/connect.py new file mode 100644 index 000000000..69071e0f3 --- /dev/null +++ b/python/zingg_v2/connect.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyspark.sql.connect import proto +from pyspark.sql.connect.plan import LogicalPlan + +if TYPE_CHECKING: + from pyspark.sql.connect.client import SparkConnectClient + +from zingg_v2.proto.connect_plugins_pb2 import SubmitZinggJob + + +class ZinggJob(LogicalPlan): + def __init__(self, zingg_args: str, zingg_job: str) -> None: + super().__init__(None) + self._args = zingg_args + self._job_json = zingg_job + + def plan(self, session: SparkConnectClient) -> proto.Relation: + plan = self._create_proto_relation() + zingg_submit = SubmitZinggJob(args=self._args, options=self._job_json) + plan.extension.Pack(zingg_submit) + + return plan diff --git a/python/zingg_v2/models.py b/python/zingg_v2/models.py index 227da7859..6aed4507c 100644 --- a/python/zingg_v2/models.py +++ b/python/zingg_v2/models.py @@ -72,7 +72,7 @@ class Arguments(BaseModel): blockSize: int = 100 column: Optional[str] = None - def validate(self, phase: str) -> bool: + def validate_phase(self, phase: str) -> bool: is_valid = True if phase in ["train", "match", "trainMatch", "link"]: is_valid &= self.trainingSamples is not None diff --git a/python/zingg_v2/proto/connect_plugins_pb2.py b/python/zingg_v2/proto/connect_plugins_pb2.py new file mode 100644 index 000000000..200f2b079 --- /dev/null +++ b/python/zingg_v2/proto/connect_plugins_pb2.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: connect_plugins.proto +# Protobuf Python Version: 4.25.3 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\">\n\x0eSubmitZinggJob\x12\x12\n\x04\x61rgs\x18\x01 \x01(\tR\x04\x61rgs\x12\x18\n\x07options\x18\x02 \x01(\tR\x07optionsb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'connect_plugins_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals['_SUBMITZINGGJOB']._serialized_start=25 + _globals['_SUBMITZINGGJOB']._serialized_end=87 +# @@protoc_insertion_point(module_scope) diff --git a/python/zingg_v2/proto/connect_plugins_pb2.pyi b/python/zingg_v2/proto/connect_plugins_pb2.pyi new file mode 100644 index 000000000..a49afce1d --- /dev/null +++ b/python/zingg_v2/proto/connect_plugins_pb2.pyi @@ -0,0 +1,33 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import google.protobuf.descriptor +import google.protobuf.message +import sys + +if sys.version_info >= (3, 8): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class SubmitZinggJob(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ARGS_FIELD_NUMBER: builtins.int + OPTIONS_FIELD_NUMBER: builtins.int + args: builtins.str + options: builtins.str + def __init__( + self, + *, + args: builtins.str = ..., + options: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["args", b"args", "options", b"options"]) -> None: ... + +global___SubmitZinggJob = SubmitZinggJob diff --git a/python/zingg_v2/proto/connect_plugins_pb2_grpc.py b/python/zingg_v2/proto/connect_plugins_pb2_grpc.py new file mode 100644 index 000000000..2daafffeb --- /dev/null +++ b/python/zingg_v2/proto/connect_plugins_pb2_grpc.py @@ -0,0 +1,4 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + From ecb06b97ac9f7334a0561f2977a9eb723400d020 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Tue, 16 Apr 2024 22:34:25 +0200 Subject: [PATCH 122/219] Batch of changes On branch main Your branch is up to date with 'origin/main'. 
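One note on the pydantic v2 calls in `createArgumentsFromJSONString` above: `model_validate` expects an already-decoded Python object, while raw JSON text should go through `model_validate_json`. A minimal sketch with a hypothetical two-field model:

    from pydantic import BaseModel

    class Job(BaseModel):
        modelId: str = "1"
        numPartitions: int = 10

    Job.model_validate({"numPartitions": 4})          # object input
    Job.model_validate_json('{"numPartitions": 4}')   # raw JSON string input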
Changes to be committed: modified: python/zingg_v2/client.py --- python/zingg_v2/client.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py index 29c491b63..c3bab31c6 100644 --- a/python/zingg_v2/client.py +++ b/python/zingg_v2/client.py @@ -2,10 +2,15 @@ import json import os +import warnings from dataclasses import fields from typing import Optional, Sequence, Union +from pyspark.sql import SparkSession +from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + from zingg_v2 import models as models_v2 +from zingg_v2.connect import ZinggJob from zingg_v2.errors import ZinggArgumentsValidationError from zingg_v2.pipes import Pipe @@ -19,15 +24,33 @@ def execute(self) -> None: # TODO: implement it # java_args: arguments in form of string # that is pairs of --key value + java_args = self.options.getClientOptions() + + # java_job_definition is JSON definition of Zingg Job + java_job_definition = self.args.writeArgumentsToJSONString() + + spark = SparkSession.getActiveSession() + + if spark is None: + _warn_msg = "Spark Session is not initialized in the current thread!" + _warn_msg += " It is strongly recommended to init SparkSession manually!" + warnings.warn(_warn_msg) + spark = SparkSession.builder.getOrCreate() + + spark_connect = hasattr(spark, "_jvm") if spark_connect: + _log_msg = "Submitting a Zingg Job\n" + _log_msg += f"Arguments: {java_args}\n\n" + _log_msg += java_job_definition + _log_msg += "\n\n" + print(_log_msg) - .. + df = ConnectDataFrame.withPlan(ZinggJob(zingg_args=java_args, zingg_job=java_job_definition), spark) + df_rows = df.collect() + for row in df_rows: + print(row.asDict()) else: spark_classic - java_args = self.options.getClientOptions() - - # java_arguments is JSON definition of Zingg Job - java_arguments = self.args.writeArgumentsToJSONString() raise NotImplementedError() From a035c91eb33329940b3e7a68cef382328179427c Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Sun, 21 Apr 2024 16:48:46 +0200 Subject: [PATCH 123/219] Batch of changes On branch main Your branch is ahead of 'origin/main' by 1 commit.
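Patch 123 below (like patch 122 above) distinguishes the two execution paths with a duck-typing check: a classic py4j-backed `SparkSession` exposes a `_jvm` handle, while a Spark Connect session does not. A sketch of the check (`_jvm` is a private attribute, so this relies on an implementation detail of pyspark):

    from pyspark.sql import SparkSession

    spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()
    # True for a classic JVM-local session; False under Spark Connect
    is_classic = hasattr(spark, "_jvm")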
(use "git push" to publish your local commits) Changes to be committed: modified: .gitignore modified: buf.gen.yaml modified: common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java modified: protobuf/connect_plugins.proto modified: python/pyproject.toml modified: python/zingg_v2/client.py modified: python/zingg_v2/models.py modified: python/zingg_v2/proto/connect_plugins_pb2.py new file: scripts/get-spark-connect-local.sh new file: scripts/run-spark-connect-local.sh new file: spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java new file: spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java new file: spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java new file: spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java modified: spark/pom.xml --- .gitignore | 10 + buf.gen.yaml | 3 + .../common/core/executor/LabelUpdater.java | 2 +- protobuf/connect_plugins.proto | 3 + python/pyproject.toml | 2 +- python/zingg_v2/client.py | 85 ++- python/zingg_v2/models.py | 24 +- python/zingg_v2/proto/connect_plugins_pb2.py | 5 +- scripts/get-spark-connect-local.sh | 4 + scripts/run-spark-connect-local.sh | 6 + .../spark/connect/ZinggConnectPlugin.java | 76 ++ .../spark/connect/proto/ConnectPlugins.java | 50 ++ .../spark/connect/proto/SubmitZinggJob.java | 678 ++++++++++++++++++ .../proto/SubmitZinggJobOrBuilder.java | 34 + spark/pom.xml | 12 + 15 files changed, 964 insertions(+), 30 deletions(-) create mode 100644 scripts/get-spark-connect-local.sh create mode 100644 scripts/run-spark-connect-local.sh create mode 100644 spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java create mode 100644 spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java create mode 100644 spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java create mode 100644 spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java diff --git a/.gitignore b/.gitignore index 4a3c2fe76..c75e61ded 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,13 @@ python/docs/_build/_doctrees # Sphinx _build **/_build + +# Helix stuff +.helix + +# JDTLS stuff +.package +.classpath +.project +.settings +.factorypath diff --git a/buf.gen.yaml b/buf.gen.yaml index 9fb08f4a5..3655c2856 100644 --- a/buf.gen.yaml +++ b/buf.gen.yaml @@ -1,5 +1,8 @@ version: v1 plugins: + # Building the Java classes + - plugin: buf.build/protocolbuffers/java:v25.3 + out: spark/client/src/main/java # Building the Python build and building the mypy interfaces. 
- plugin: buf.build/protocolbuffers/python:v25.3 out: python/zingg_v2/proto diff --git a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java index 4e3365783..e712eda59 100644 --- a/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java +++ b/common/core/src/main/java/zingg/common/core/executor/LabelUpdater.java @@ -154,4 +154,4 @@ protected Pipe getOutputPipe() { } protected abstract Pipe setSaveModeOnPipe(Pipe p); -} \ No newline at end of file +} diff --git a/protobuf/connect_plugins.proto b/protobuf/connect_plugins.proto index ff0c64b6f..085541b5f 100644 --- a/protobuf/connect_plugins.proto +++ b/protobuf/connect_plugins.proto @@ -1,5 +1,8 @@ syntax = 'proto3'; +option java_multiple_files = true; +option java_package = "zingg.spark.connect.proto"; + message SubmitZinggJob { string args = 1; string options = 2; diff --git a/python/pyproject.toml b/python/pyproject.toml index 40a91838f..6ce24f6ce 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,4 +18,4 @@ readme = "README.md" requires-python = ">=3.11" [tool.ruff] -line-length = 150 +line-length = 110 diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py index c3bab31c6..7e7075f47 100644 --- a/python/zingg_v2/client.py +++ b/python/zingg_v2/client.py @@ -6,8 +6,9 @@ from dataclasses import fields from typing import Optional, Sequence, Union -from pyspark.sql import SparkSession +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame +from pyspark.sql.connect.session import SparkSession as ConnectSession from zingg_v2 import models as models_v2 from zingg_v2.connect import ZinggJob @@ -19,8 +20,15 @@ class Zingg: def __init__(self, args: Arguments, options: ClientOptions) -> None: self.args = args self.options = options + self.spark: Union[SparkSession, ConnectSession] = SparkSession.getActiveSession() - def execute(self) -> None: + if self.spark is None: + _warn_msg = "Spark Session is not initialized in the current thread!" + _warn_msg += " It is strongly recommended to init SparkSession manually!" + warnings.warn(_warn_msg) + self.spark = SparkSession.builder.getOrCreate() + + def execute(self) -> Zingg: # TODO: implement it # java_args: arguments in form of string # that is pairs of --key value @@ -29,30 +37,48 @@ def execute(self) -> None: # java_job_definition is JSON definition of Zingg Job java_job_definition = self.args.writeArgumentsToJSONString() - spark = SparkSession.getActiveSession() + spark_connect = hasattr(self.spark, "_jvm") - if spark is None: - _warn_msg = "Spark Session is not initialized in the current thread!" 
- _warn_msg += " It is strongly recommended to init SparkSession manually!"
- warnings.warn(_warn_msg) - spark = SparkSession.builder.getOrCreate() - - spark_connect = hasattr(spark, "_jvm") - - if spark_connect: + if not spark_connect: _log_msg = "Submitting a Zingg Job\n" _log_msg += f"Arguments: {java_args}\n\n" _log_msg += java_job_definition _log_msg += "\n\n" print(_log_msg) - df = ConnectDataFrame.withPlan(ZinggJob(zingg_args=java_args, zingg_job=java_job_definition), spark) - df_rows = df.collect() - for row in df_rows: - print(row.asDict()) + df = ConnectDataFrame.withPlan( + ZinggJob(zingg_args=java_args, zingg_job=java_job_definition), self.spark + ) + output = df.collect()[0].asDict() + status: str = output["status"] + new_args: str = output["newArgs"] + else: - spark_classic + # TODO: Put that logic into Java by creating an entry point for Python API? + j_options = self.spark._jvm.zingg.common.client.ClientOptions(java_args) + j_args = self.spark._jvm.zingg.common.client.ArgumentsUtil().createArgumentsFromJSONString( + java_job_definition, + self.options.getPhase(), + ) + client = self.spark._jvm.zingg.spark.client.SparkClient( + j_args, + j_options, + self.spark._jsparkSession, + ) + client.init() + client.execute() + client.postMetrics() - raise NotImplementedError() + status = "SUCCESS" + new_args: str = self.spark._jvm.zingg.common.client.ArgumentsUtil().writeArgumentstoJSONString( + client.getArguments() + ) + + print(f"Zingg Job output status: {status}") + + return Zingg( + Arguments.createArgumentsFromJSONString(new_args, self.options.getPhase()), + self.options.make_copy(), + ) def executeLabel(self) -> None: raise NotImplementedError() @@ -60,13 +86,19 @@ def executeLabel(self) -> None: def executeLabelUpdate(self) -> None: raise NotImplementedError() - def getMarkedRecords(self) -> None: - raise NotImplementedError() + def getMarkedRecords(self) -> Union[DataFrame, ConnectDataFrame]: + marked_path = self.args.getZinggTrainingDataMarkedDir() + marked = self.spark.read.parquet(marked_path) + return marked - def getUnmarkedRecords(self) -> None: - raise NotImplementedError() + def getUnmarkedRecords(self) -> Union[DataFrame, ConnectDataFrame]: + unmarked_path = self.args.getZinggTrainingDataUnmarkedDir() + unmarked = self.spark.read.parquet(unmarked_path) + return unmarked - def processRecordsCli(self, unmarkedRecords, args): + def processRecordsCli( + self, unmarkedRecords: Union[DataFrame, ConnectDataFrame], args: Arguments + ) -> Union[DataFrame, ConnectDataFrame]: raise NotImplementedError() def processRecordsCliLabelUpdate(self, lines, args): @@ -110,7 +142,9 @@ def __init__(self, name: str, dataType: str, *matchType: Union[str, models_v2.Ma if not isinstance(mt, models_v2.MatchType): mt = models_v2.MatchType(mt) - self._model_v2 = models_v2.FieldDefinition(fieldName=name, fields=name, dataType=dataType, matchType=match_types) + self._model_v2 = models_v2.FieldDefinition( + fieldName=name, fields=name, dataType=dataType, matchType=match_types + ) def setStopWords(self, stopWords: str) -> None: self._model_v2.stopWords = stopWords @@ -175,6 +209,9 @@ def getLocation(self) -> Optional[str]: def to_v2(self) -> models_v2.ClientOptions: return self._opt_v2 + def make_copy(self) -> ClientOptions: + return ClientOptions(self._opt_v2.to_java_args()) + class Arguments: def __init__(self): diff --git a/python/zingg_v2/models.py b/python/zingg_v2/models.py index 6aed4507c..ce25eb13d 100644 --- a/python/zingg_v2/models.py +++ b/python/zingg_v2/models.py @@ -5,7 +5,7 @@ from enum import StrEnum, auto from typing import Any, Optional, Union -from pydantic import BaseModel
+from pydantic import BaseModel, Field, field_validator @@ -51,7 +51,9 @@ class Pipe(BaseModel): name: str format: DataFormat props: dict[str, Any] = {} - schema: Optional[str] = None + # "schema" is a built in attribute of BaseModel + # that is why we need that alias: + schema_field: Optional[str] = Field(default=None, alias="schema") mode: Optional[str] = None @@ -72,6 +74,24 @@ class Arguments(BaseModel): blockSize: int = 100 column: Optional[str] = None + @field_validator("numPartitions") + @classmethod + def validate_num_partitions(cls, v: int) -> int: + if (v != -1) or (v <= 0): + _err_msg = "Number of partitions can be greater than 0 for user specified partitioning or equal to -1 for system decided partitioning" + raise ValueError(_err_msg) + + return v + + @field_validator("labelDataSampleSize", "stopWordsCutoff") + @classmethod + def validate_relative_size(cls, v: float) -> float: + if (v > 1) or (v < 0): + _err_msg = "Label Data Sample Size should be between 0 and 1" + raise ValueError(_err_msg) + + return v + def validate_phase(self, phase: str) -> bool: is_valid = True if phase in ["train", "match", "trainMatch", "link"]: diff --git a/python/zingg_v2/proto/connect_plugins_pb2.py b/python/zingg_v2/proto/connect_plugins_pb2.py index 200f2b079..9323eb5f6 100644 --- a/python/zingg_v2/proto/connect_plugins_pb2.py +++ b/python/zingg_v2/proto/connect_plugins_pb2.py @@ -14,13 +14,14 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\">\n\x0eSubmitZinggJob\x12\x12\n\x04\x61rgs\x18\x01 \x01(\tR\x04\x61rgs\x12\x18\n\x07options\x18\x02 \x01(\tR\x07optionsb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\">\n\x0eSubmitZinggJob\x12\x12\n\x04\x61rgs\x18\x01 \x01(\tR\x04\x61rgs\x12\x18\n\x07options\x18\x02 \x01(\tR\x07optionsB\x1d\n\x19zingg.spark.connect.protoP\x01\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'connect_plugins_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None + _globals['DESCRIPTOR']._options = None + _globals['DESCRIPTOR']._serialized_options = b'\n\031zingg.spark.connect.protoP\001' _globals['_SUBMITZINGGJOB']._serialized_start=25 _globals['_SUBMITZINGGJOB']._serialized_end=87 # @@protoc_insertion_point(module_scope) diff --git a/scripts/get-spark-connect-local.sh b/scripts/get-spark-connect-local.sh new file mode 100644 index 000000000..43e878102 --- /dev/null +++ b/scripts/get-spark-connect-local.sh @@ -0,0 +1,4 @@ +#!/usr/bin/bash + +wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz +tar -xvf spark-3.5.1-bin-hadoop3.tgz diff --git a/scripts/run-spark-connect-local.sh b/scripts/run-spark-connect-local.sh new file mode 100644 index 000000000..a13b834fe --- /dev/null +++ b/scripts/run-spark-connect-local.sh @@ -0,0 +1,6 @@ +#!/usr/bin/bash + + spark-3.5.1-bin-hadoop3/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.1 \ + --jars zing_jar_path \ + --conf spark.connect.extensions.relation.classes=zingg.spark.connect.ZinggConnectPlugin diff --git a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java new file mode 100644 index 000000000..15ef4dde8 --- /dev/null +++
b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java @@ -0,0 +1,76 @@ +package zingg.spark.connect; + +import com.google.protobuf.Any; +import com.google.protobuf.InvalidProtocolBufferException; +import java.util.Arrays; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.connect.planner.SparkConnectPlanner; +import org.apache.spark.sql.connect.plugin.RelationPlugin; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import scala.Option; +import zingg.common.client.ArgumentsUtil; +import zingg.common.client.ClientOptions; +import zingg.common.client.IArguments; +import zingg.common.client.ZinggClientException; +import zingg.spark.client.SparkClient; +import zingg.spark.connect.proto.SubmitZinggJob; + +class ZinggConnectPlugin implements RelationPlugin { + @Override + public Option<LogicalPlan> transform(org.sparkproject.connect.protobuf.Any relation, SparkConnectPlanner planner) { + if (relation.is(SubmitZinggJob.class)) { + SubmitZinggJob message; + try { + message = relation.unpack(SubmitZinggJob.class); + } catch (InvalidProtocolBufferException e) { + // Should be unreachable due to explicit check of the message type + throw new RuntimeException(e); + } + String cliArgs = message.getArgs(); + String options = message.getOptions(); + // Parsing of options and arguments + ClientOptions clientOptions = new ClientOptions(cliArgs); + IArguments arguments; + try { + arguments = new ArgumentsUtil().createArgumentsFromJSONString( + options, + clientOptions.getOptionValue(ClientOptions.PHASE) + ); + } catch (ZinggClientException e) { + throw new RuntimeException(e); + } + // Get active session and create Zingg Client + try (SparkSession session = planner.sessionHolder().session()) { + SparkClient client = new SparkClient(arguments, clientOptions, session); + // TODO: How to capture logs to send them back to client? + + // Run the job + client.init(); + client.execute(); + client.postMetrics(); + + // Build an output DataFrame object + IArguments newArgs = client.getArguments(); + Dataset<Row> outputDf = session.createDataFrame( + Arrays.asList(RowFactory.create("SUCCESS", new ArgumentsUtil().writeArgumentstoJSONString(newArgs))), + new StructType( + new StructField[]{ + DataTypes.createStructField("status", DataTypes.StringType, false), + DataTypes.createStructField("newArgs", DataTypes.StringType, false) + } + ) + ); + return Option.apply(outputDf.logicalPlan()); + } catch (ZinggClientException e) { + throw new RuntimeException(e); + } + } + return Option.empty(); + } +} diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java new file mode 100644 index 000000000..506523040 --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java @@ -0,0 +1,50 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: connect_plugins.proto + +// Protobuf Java Version: 3.25.3 +package zingg.spark.connect.proto; + +public final class ConnectPlugins { + private ConnectPlugins() {} + public static void registerAllExtensions( + com.google.protobuf.ExtensionRegistryLite registry) { + } + + public static void registerAllExtensions( + com.google.protobuf.ExtensionRegistry registry) { + registerAllExtensions( + (com.google.protobuf.ExtensionRegistryLite) registry); + } + static final com.google.protobuf.Descriptors.Descriptor + internal_static_SubmitZinggJob_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_SubmitZinggJob_fieldAccessorTable; + + public static com.google.protobuf.Descriptors.FileDescriptor + getDescriptor() { + return descriptor; + } + private static com.google.protobuf.Descriptors.FileDescriptor + descriptor; + static { + java.lang.String[] descriptorData = { + "\n\025connect_plugins.proto\">\n\016SubmitZinggJo" + + "b\022\022\n\004args\030\001 \001(\tR\004args\022\030\n\007options\030\002 \001(\tR\007" + + "optionsB\035\n\031zingg.spark.connect.protoP\001b\006" + + "proto3" + }; + descriptor = com.google.protobuf.Descriptors.FileDescriptor + .internalBuildGeneratedFileFrom(descriptorData, + new com.google.protobuf.Descriptors.FileDescriptor[] { + }); + internal_static_SubmitZinggJob_descriptor = + getDescriptor().getMessageTypes().get(0); + internal_static_SubmitZinggJob_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_SubmitZinggJob_descriptor, + new java.lang.String[] { "Args", "Options", }); + } + + // @@protoc_insertion_point(outer_class_scope) +} diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java new file mode 100644 index 000000000..4a74b18a4 --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java @@ -0,0 +1,678 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: connect_plugins.proto + +// Protobuf Java Version: 3.25.3 +package zingg.spark.connect.proto; + +/** + * Protobuf type {@code SubmitZinggJob} + */ +public final class SubmitZinggJob extends + com.google.protobuf.GeneratedMessageV3 implements + // @@protoc_insertion_point(message_implements:SubmitZinggJob) + SubmitZinggJobOrBuilder { +private static final long serialVersionUID = 0L; + // Use SubmitZinggJob.newBuilder() to construct. 
+ private SubmitZinggJob(com.google.protobuf.GeneratedMessageV3.Builder builder) { + super(builder); + } + private SubmitZinggJob() { + args_ = ""; + options_ = ""; + } + + @java.lang.Override + @SuppressWarnings({"unused"}) + protected java.lang.Object newInstance( + UnusedPrivateParameter unused) { + return new SubmitZinggJob(); + } + + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return zingg.spark.connect.proto.ConnectPlugins.internal_static_SubmitZinggJob_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return zingg.spark.connect.proto.ConnectPlugins.internal_static_SubmitZinggJob_fieldAccessorTable + .ensureFieldAccessorsInitialized( + zingg.spark.connect.proto.SubmitZinggJob.class, zingg.spark.connect.proto.SubmitZinggJob.Builder.class); + } + + public static final int ARGS_FIELD_NUMBER = 1; + @SuppressWarnings("serial") + private volatile java.lang.Object args_ = ""; + /** + * string args = 1 [json_name = "args"]; + * @return The args. + */ + @java.lang.Override + public java.lang.String getArgs() { + java.lang.Object ref = args_; + if (ref instanceof java.lang.String) { + return (java.lang.String) ref; + } else { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + args_ = s; + return s; + } + } + /** + * string args = 1 [json_name = "args"]; + * @return The bytes for args. + */ + @java.lang.Override + public com.google.protobuf.ByteString + getArgsBytes() { + java.lang.Object ref = args_; + if (ref instanceof java.lang.String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + args_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + + public static final int OPTIONS_FIELD_NUMBER = 2; + @SuppressWarnings("serial") + private volatile java.lang.Object options_ = ""; + /** + * string options = 2 [json_name = "options"]; + * @return The options. + */ + @java.lang.Override + public java.lang.String getOptions() { + java.lang.Object ref = options_; + if (ref instanceof java.lang.String) { + return (java.lang.String) ref; + } else { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + options_ = s; + return s; + } + } + /** + * string options = 2 [json_name = "options"]; + * @return The bytes for options. 
+ */ + @java.lang.Override + public com.google.protobuf.ByteString + getOptionsBytes() { + java.lang.Object ref = options_; + if (ref instanceof java.lang.String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + options_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + + private byte memoizedIsInitialized = -1; + @java.lang.Override + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized == 1) return true; + if (isInitialized == 0) return false; + + memoizedIsInitialized = 1; + return true; + } + + @java.lang.Override + public void writeTo(com.google.protobuf.CodedOutputStream output) + throws java.io.IOException { + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(args_)) { + com.google.protobuf.GeneratedMessageV3.writeString(output, 1, args_); + } + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(options_)) { + com.google.protobuf.GeneratedMessageV3.writeString(output, 2, options_); + } + getUnknownFields().writeTo(output); + } + + @java.lang.Override + public int getSerializedSize() { + int size = memoizedSize; + if (size != -1) return size; + + size = 0; + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(args_)) { + size += com.google.protobuf.GeneratedMessageV3.computeStringSize(1, args_); + } + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(options_)) { + size += com.google.protobuf.GeneratedMessageV3.computeStringSize(2, options_); + } + size += getUnknownFields().getSerializedSize(); + memoizedSize = size; + return size; + } + + @java.lang.Override + public boolean equals(final java.lang.Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof zingg.spark.connect.proto.SubmitZinggJob)) { + return super.equals(obj); + } + zingg.spark.connect.proto.SubmitZinggJob other = (zingg.spark.connect.proto.SubmitZinggJob) obj; + + if (!getArgs() + .equals(other.getArgs())) return false; + if (!getOptions() + .equals(other.getOptions())) return false; + if (!getUnknownFields().equals(other.getUnknownFields())) return false; + return true; + } + + @java.lang.Override + public int hashCode() { + if (memoizedHashCode != 0) { + return memoizedHashCode; + } + int hash = 41; + hash = (19 * hash) + getDescriptor().hashCode(); + hash = (37 * hash) + ARGS_FIELD_NUMBER; + hash = (53 * hash) + getArgs().hashCode(); + hash = (37 * hash) + OPTIONS_FIELD_NUMBER; + hash = (53 * hash) + getOptions().hashCode(); + hash = (29 * hash) + getUnknownFields().hashCode(); + memoizedHashCode = hash; + return hash; + } + + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + java.nio.ByteBuffer data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + java.nio.ByteBuffer data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException 
{ + return PARSER.parseFrom(data, extensionRegistry); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom(byte[] data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + byte[] data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom(java.io.InputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseWithIOException(PARSER, input); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseWithIOException(PARSER, input, extensionRegistry); + } + + public static zingg.spark.connect.proto.SubmitZinggJob parseDelimitedFrom(java.io.InputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseDelimitedWithIOException(PARSER, input); + } + + public static zingg.spark.connect.proto.SubmitZinggJob parseDelimitedFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseDelimitedWithIOException(PARSER, input, extensionRegistry); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + com.google.protobuf.CodedInputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseWithIOException(PARSER, input); + } + public static zingg.spark.connect.proto.SubmitZinggJob parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3 + .parseWithIOException(PARSER, input, extensionRegistry); + } + + @java.lang.Override + public Builder newBuilderForType() { return newBuilder(); } + public static Builder newBuilder() { + return DEFAULT_INSTANCE.toBuilder(); + } + public static Builder newBuilder(zingg.spark.connect.proto.SubmitZinggJob prototype) { + return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype); + } + @java.lang.Override + public Builder toBuilder() { + return this == DEFAULT_INSTANCE + ? 
new Builder() : new Builder().mergeFrom(this); + } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + /** + * Protobuf type {@code SubmitZinggJob} + */ + public static final class Builder extends + com.google.protobuf.GeneratedMessageV3.Builder implements + // @@protoc_insertion_point(builder_implements:SubmitZinggJob) + zingg.spark.connect.proto.SubmitZinggJobOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return zingg.spark.connect.proto.ConnectPlugins.internal_static_SubmitZinggJob_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return zingg.spark.connect.proto.ConnectPlugins.internal_static_SubmitZinggJob_fieldAccessorTable + .ensureFieldAccessorsInitialized( + zingg.spark.connect.proto.SubmitZinggJob.class, zingg.spark.connect.proto.SubmitZinggJob.Builder.class); + } + + // Construct using zingg.spark.connect.proto.SubmitZinggJob.newBuilder() + private Builder() { + + } + + private Builder( + com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + super(parent); + + } + @java.lang.Override + public Builder clear() { + super.clear(); + bitField0_ = 0; + args_ = ""; + options_ = ""; + return this; + } + + @java.lang.Override + public com.google.protobuf.Descriptors.Descriptor + getDescriptorForType() { + return zingg.spark.connect.proto.ConnectPlugins.internal_static_SubmitZinggJob_descriptor; + } + + @java.lang.Override + public zingg.spark.connect.proto.SubmitZinggJob getDefaultInstanceForType() { + return zingg.spark.connect.proto.SubmitZinggJob.getDefaultInstance(); + } + + @java.lang.Override + public zingg.spark.connect.proto.SubmitZinggJob build() { + zingg.spark.connect.proto.SubmitZinggJob result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + @java.lang.Override + public zingg.spark.connect.proto.SubmitZinggJob buildPartial() { + zingg.spark.connect.proto.SubmitZinggJob result = new zingg.spark.connect.proto.SubmitZinggJob(this); + if (bitField0_ != 0) { buildPartial0(result); } + onBuilt(); + return result; + } + + private void buildPartial0(zingg.spark.connect.proto.SubmitZinggJob result) { + int from_bitField0_ = bitField0_; + if (((from_bitField0_ & 0x00000001) != 0)) { + result.args_ = args_; + } + if (((from_bitField0_ & 0x00000002) != 0)) { + result.options_ = options_; + } + } + + @java.lang.Override + public Builder clone() { + return super.clone(); + } + @java.lang.Override + public Builder setField( + com.google.protobuf.Descriptors.FieldDescriptor field, + java.lang.Object value) { + return super.setField(field, value); + } + @java.lang.Override + public Builder clearField( + com.google.protobuf.Descriptors.FieldDescriptor field) { + return super.clearField(field); + } + @java.lang.Override + public Builder clearOneof( + com.google.protobuf.Descriptors.OneofDescriptor oneof) { + return super.clearOneof(oneof); + } + @java.lang.Override + public Builder setRepeatedField( + com.google.protobuf.Descriptors.FieldDescriptor field, + int index, java.lang.Object value) { + return super.setRepeatedField(field, index, value); + } + @java.lang.Override + public Builder addRepeatedField( + com.google.protobuf.Descriptors.FieldDescriptor field, + java.lang.Object value) { + return 
super.addRepeatedField(field, value); + } + @java.lang.Override + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof zingg.spark.connect.proto.SubmitZinggJob) { + return mergeFrom((zingg.spark.connect.proto.SubmitZinggJob)other); + } else { + super.mergeFrom(other); + return this; + } + } + + public Builder mergeFrom(zingg.spark.connect.proto.SubmitZinggJob other) { + if (other == zingg.spark.connect.proto.SubmitZinggJob.getDefaultInstance()) return this; + if (!other.getArgs().isEmpty()) { + args_ = other.args_; + bitField0_ |= 0x00000001; + onChanged(); + } + if (!other.getOptions().isEmpty()) { + options_ = other.options_; + bitField0_ |= 0x00000002; + onChanged(); + } + this.mergeUnknownFields(other.getUnknownFields()); + onChanged(); + return this; + } + + @java.lang.Override + public final boolean isInitialized() { + return true; + } + + @java.lang.Override + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + if (extensionRegistry == null) { + throw new java.lang.NullPointerException(); + } + try { + boolean done = false; + while (!done) { + int tag = input.readTag(); + switch (tag) { + case 0: + done = true; + break; + case 10: { + args_ = input.readStringRequireUtf8(); + bitField0_ |= 0x00000001; + break; + } // case 10 + case 18: { + options_ = input.readStringRequireUtf8(); + bitField0_ |= 0x00000002; + break; + } // case 18 + default: { + if (!super.parseUnknownField(input, extensionRegistry, tag)) { + done = true; // was an endgroup tag + } + break; + } // default: + } // switch (tag) + } // while (!done) + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.unwrapIOException(); + } finally { + onChanged(); + } // finally + return this; + } + private int bitField0_; + + private java.lang.Object args_ = ""; + /** + * string args = 1 [json_name = "args"]; + * @return The args. + */ + public java.lang.String getArgs() { + java.lang.Object ref = args_; + if (!(ref instanceof java.lang.String)) { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + args_ = s; + return s; + } else { + return (java.lang.String) ref; + } + } + /** + * string args = 1 [json_name = "args"]; + * @return The bytes for args. + */ + public com.google.protobuf.ByteString + getArgsBytes() { + java.lang.Object ref = args_; + if (ref instanceof String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + args_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + /** + * string args = 1 [json_name = "args"]; + * @param value The args to set. + * @return This builder for chaining. + */ + public Builder setArgs( + java.lang.String value) { + if (value == null) { throw new NullPointerException(); } + args_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + /** + * string args = 1 [json_name = "args"]; + * @return This builder for chaining. + */ + public Builder clearArgs() { + args_ = getDefaultInstance().getArgs(); + bitField0_ = (bitField0_ & ~0x00000001); + onChanged(); + return this; + } + /** + * string args = 1 [json_name = "args"]; + * @param value The bytes for args to set. + * @return This builder for chaining. 
+ */ + public Builder setArgsBytes( + com.google.protobuf.ByteString value) { + if (value == null) { throw new NullPointerException(); } + checkByteStringIsUtf8(value); + args_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + + private java.lang.Object options_ = ""; + /** + * string options = 2 [json_name = "options"]; + * @return The options. + */ + public java.lang.String getOptions() { + java.lang.Object ref = options_; + if (!(ref instanceof java.lang.String)) { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + options_ = s; + return s; + } else { + return (java.lang.String) ref; + } + } + /** + * string options = 2 [json_name = "options"]; + * @return The bytes for options. + */ + public com.google.protobuf.ByteString + getOptionsBytes() { + java.lang.Object ref = options_; + if (ref instanceof String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + options_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + /** + * string options = 2 [json_name = "options"]; + * @param value The options to set. + * @return This builder for chaining. + */ + public Builder setOptions( + java.lang.String value) { + if (value == null) { throw new NullPointerException(); } + options_ = value; + bitField0_ |= 0x00000002; + onChanged(); + return this; + } + /** + * string options = 2 [json_name = "options"]; + * @return This builder for chaining. + */ + public Builder clearOptions() { + options_ = getDefaultInstance().getOptions(); + bitField0_ = (bitField0_ & ~0x00000002); + onChanged(); + return this; + } + /** + * string options = 2 [json_name = "options"]; + * @param value The bytes for options to set. + * @return This builder for chaining. 
+ */ + public Builder setOptionsBytes( + com.google.protobuf.ByteString value) { + if (value == null) { throw new NullPointerException(); } + checkByteStringIsUtf8(value); + options_ = value; + bitField0_ |= 0x00000002; + onChanged(); + return this; + } + @java.lang.Override + public final Builder setUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.setUnknownFields(unknownFields); + } + + @java.lang.Override + public final Builder mergeUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.mergeUnknownFields(unknownFields); + } + + + // @@protoc_insertion_point(builder_scope:SubmitZinggJob) + } + + // @@protoc_insertion_point(class_scope:SubmitZinggJob) + private static final zingg.spark.connect.proto.SubmitZinggJob DEFAULT_INSTANCE; + static { + DEFAULT_INSTANCE = new zingg.spark.connect.proto.SubmitZinggJob(); + } + + public static zingg.spark.connect.proto.SubmitZinggJob getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + private static final com.google.protobuf.Parser + PARSER = new com.google.protobuf.AbstractParser() { + @java.lang.Override + public SubmitZinggJob parsePartialFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + Builder builder = newBuilder(); + try { + builder.mergeFrom(input, extensionRegistry); + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.setUnfinishedMessage(builder.buildPartial()); + } catch (com.google.protobuf.UninitializedMessageException e) { + throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial()); + } catch (java.io.IOException e) { + throw new com.google.protobuf.InvalidProtocolBufferException(e) + .setUnfinishedMessage(builder.buildPartial()); + } + return builder.buildPartial(); + } + }; + + public static com.google.protobuf.Parser parser() { + return PARSER; + } + + @java.lang.Override + public com.google.protobuf.Parser getParserForType() { + return PARSER; + } + + @java.lang.Override + public zingg.spark.connect.proto.SubmitZinggJob getDefaultInstanceForType() { + return DEFAULT_INSTANCE; + } + +} + diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java new file mode 100644 index 000000000..a231069d6 --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java @@ -0,0 +1,34 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: connect_plugins.proto + +// Protobuf Java Version: 3.25.3 +package zingg.spark.connect.proto; + +public interface SubmitZinggJobOrBuilder extends + // @@protoc_insertion_point(interface_extends:SubmitZinggJob) + com.google.protobuf.MessageOrBuilder { + + /** + * string args = 1 [json_name = "args"]; + * @return The args. + */ + java.lang.String getArgs(); + /** + * string args = 1 [json_name = "args"]; + * @return The bytes for args. + */ + com.google.protobuf.ByteString + getArgsBytes(); + + /** + * string options = 2 [json_name = "options"]; + * @return The options. + */ + java.lang.String getOptions(); + /** + * string options = 2 [json_name = "options"]; + * @return The bytes for options. 
+ */ + com.google.protobuf.ByteString + getOptionsBytes(); +} diff --git a/spark/pom.xml b/spark/pom.xml index 2ea784073..8d09195da 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -13,6 +13,18 @@ client + + org.apache.spark + spark-connect_${scala.binary.version} + ${spark.version} + + + com.google.guava + guava + + + provided + org.apache.spark spark-mllib_${scala.binary.version} From b73888b0a93aca077384464f40312feab5774e93 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 27 Mar 2024 13:54:26 +0530 Subject: [PATCH 124/219] pluggable canopy --- .../java/zingg/common/core/block/Block.java | 20 +++++++++++-------- .../java/zingg/common/core/block/Canopy.java | 14 ++++++------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/block/Block.java b/common/core/src/main/java/zingg/common/core/block/Block.java index 35bde6b54..b304c1b48 100644 --- a/common/core/src/main/java/zingg/common/core/block/Block.java +++ b/common/core/src/main/java/zingg/common/core/block/Block.java @@ -66,16 +66,13 @@ public void setDupes(ZFrame dupes) { /** * @return the types * - * public Class[] getTypes() { return types; } */ /** * @param types - * the types to set + * the types to set * - * public void setTypes(Class[] types) { this.types = types; } - * - * /** + * * @return the maxSize */ public long getMaxSize() { @@ -84,7 +81,7 @@ public long getMaxSize() { /** * @param maxSize - * the maxSize to set + * the maxSize to set */ public void setMaxSize(long maxSize) { this.maxSize = maxSize; @@ -102,10 +99,13 @@ protected void setFunctionsMap(ListMap> m) { this.functionsMap = m; } + protected Canopy getCanopy(){ + return new Canopy(); + } public CanopygetNodeFromCurrent(Canopynode, HashFunction function, FieldDefinition context) { - Canopytrial = new Canopy(); + Canopytrial = getCanopy(); trial = node.copyTo(trial); // node.training, node.dupeN, function, context); trial.function = function; @@ -113,6 +113,10 @@ protected void setFunctionsMap(ListMap> m) { return trial; } + public void estimateElimCount(Canopy c, long elimCount) { + c.estimateElimCount(); + } + public abstract T getDataTypeFromString(String t); public CanopygetBestNode(Tree> tree, Canopyparent, Canopynode, @@ -144,7 +148,7 @@ protected void setFunctionsMap(ListMap> m) { + " and function " + function + " for " + field.dataType); Canopytrial = getNodeFromCurrent(node, function, context); - trial.estimateElimCount(); + estimateElimCount(trial, least); long elimCount = trial.getElimCount(); diff --git a/common/core/src/main/java/zingg/common/core/block/Canopy.java b/common/core/src/main/java/zingg/common/core/block/Canopy.java index 25f0d4124..09451c56d 100644 --- a/common/core/src/main/java/zingg/common/core/block/Canopy.java +++ b/common/core/src/main/java/zingg/common/core/block/Canopy.java @@ -20,19 +20,19 @@ public class Canopy implements Serializable { public static final Log LOG = LogFactory.getLog(Canopy.class); // created by function edge leading from parent to this node - HashFunction function; + protected HashFunction function; // aplied on field - FieldDefinition context; + protected FieldDefinition context; // list of duplicates passed from parent - List dupeN; + protected List dupeN; // number of duplicates eliminated after function applied on fn context - long elimCount; + protected long elimCount; // hash of canopy - Object hash; + protected Object hash; // training set - List training; + protected List training; // duplicates remaining after function is applied - List 
dupeRemaining; + protected List dupeRemaining; public Canopy() { } From bd48ac9dabed30ff3f578829a14986c82c8cb177 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 27 Mar 2024 22:46:37 +0530 Subject: [PATCH 125/219] debug logs in if then else --- .../java/zingg/common/core/block/Block.java | 18 +++++++++++++----- .../zingg/common/core/hash/FirstChars.java | 2 +- .../src/test/java/zingg/block/TestBlock.java | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/block/Block.java b/common/core/src/main/java/zingg/common/core/block/Block.java index b304c1b48..06c0e8c13 100644 --- a/common/core/src/main/java/zingg/common/core/block/Block.java +++ b/common/core/src/main/java/zingg/common/core/block/Block.java @@ -126,14 +126,18 @@ public void estimateElimCount(Canopy c, long elimCount) { Canopybest = null; for (FieldDefinition field : fieldsOfInterest) { - LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt " - + getDataTypeFromString(field.getDataType())); + if (LOG.isDebugEnabled()){ + LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt " + + getDataTypeFromString(field.getDataType())); + } //Class type = FieldClass.getFieldClassClass(field.getFieldClass()); FieldDefinition context = field; if (least ==0) break;//how much better can it get? // applicable functions List> functions = functionsMap.get(getDataTypeFromString(field.getDataType())); - LOG.debug("functions are " + functions); + if (LOG.isDebugEnabled()){ + LOG.debug("functions are " + functions); + } if (functions != null) { @@ -144,8 +148,10 @@ public void estimateElimCount(Canopy c, long elimCount) { //!childless.contains(function, field.fieldName) ) { - LOG.debug("Evaluating field " + field.fieldName + if (LOG.isDebugEnabled()){ + LOG.debug("Evaluating field " + field.fieldName + " and function " + function + " for " + field.dataType); + } Canopytrial = getNodeFromCurrent(node, function, context); estimateElimCount(trial, least); @@ -182,7 +188,9 @@ public void estimateElimCount(Canopy c, long elimCount) { }*/ } else { - LOG.debug("No child " + function); + if (LOG.isDebugEnabled()){ + LOG.debug("No child " + function); + } //childless.add(function, field.fieldName); } diff --git a/common/core/src/main/java/zingg/common/core/hash/FirstChars.java b/common/core/src/main/java/zingg/common/core/hash/FirstChars.java index 116b67cc9..78ad3042d 100644 --- a/common/core/src/main/java/zingg/common/core/hash/FirstChars.java +++ b/common/core/src/main/java/zingg/common/core/hash/FirstChars.java @@ -32,7 +32,7 @@ public String call(String field) { r = field.trim().substring(0, endIndex); } } - LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r); + //LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r); return r; } diff --git a/spark/core/src/test/java/zingg/block/TestBlock.java b/spark/core/src/test/java/zingg/block/TestBlock.java index 5d80ca66a..17cbdb93a 100644 --- a/spark/core/src/test/java/zingg/block/TestBlock.java +++ b/spark/core/src/test/java/zingg/block/TestBlock.java @@ -50,6 +50,7 @@ public void testTree() throws Throwable { // primary deciding is unique year so identityInteger should have been picked Canopy head = blockingTree.getHead(); assertEquals("identityInteger", head.getFunction().getName()); + blockingTree.toString(); } From 9b8ec9f88fbcedaf490483af50ce0216875b35d1 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Mon, 22 Apr 
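/*
 * Editor's note on the isDebugEnabled() guards of patch 125 above: the guard
 * is the standard commons-logging idiom, and here it is a real saving rather
 * than style, because without it the debug message is string-concatenated on
 * every canopy evaluation even when debug logging is off:
 *
 *   if (LOG.isDebugEnabled()) {
 *       LOG.debug("functions are " + functions);  // built only when enabled
 *   }
 */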
2024 16:31:40 +0200 Subject: [PATCH 126/219] Semi-working version of On branch main Your branch is up to date with 'origin/main'. Changes to be committed: modified: buf.gen.yaml new file: python/test_spark_connect.py modified: python/zingg_v2/client.py modified: python/zingg_v2/models.py modified: scripts/run-spark-connect-local.sh deleted: spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java new file: spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala modified: spark/pom.xml Untracked files: spark-3.5.1-bin-hadoop3.tgz spark-3.5.1-bin-hadoop3/ --- buf.gen.yaml | 1 + python/test_spark_connect.py | 10 +++ python/zingg_v2/client.py | 4 +- python/zingg_v2/models.py | 8 +- scripts/run-spark-connect-local.sh | 8 +- .../spark/connect/ZinggConnectPlugin.java | 76 ------------------- .../spark/connect/ZinggConnectPlugin.scala | 42 ++++++++++ spark/pom.xml | 26 ++++++- 8 files changed, 91 insertions(+), 84 deletions(-) create mode 100644 python/test_spark_connect.py delete mode 100644 spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java create mode 100644 spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala diff --git a/buf.gen.yaml b/buf.gen.yaml index 3655c2856..3265e6a2c 100644 --- a/buf.gen.yaml +++ b/buf.gen.yaml @@ -10,3 +10,4 @@ plugins: out: python/zingg_v2/proto - plugin: buf.build/community/nipunn1313-mypy:v3.5.0 out: python/zingg_v2/proto + diff --git a/python/test_spark_connect.py b/python/test_spark_connect.py new file mode 100644 index 000000000..6221d4bc3 --- /dev/null +++ b/python/test_spark_connect.py @@ -0,0 +1,10 @@ +from zingg_v2.client import Zingg, Arguments, ClientOptions +from pyspark.sql.connect.session import SparkSession + + +if __name__ == "__main__": + spark = SparkSession.builder.remote("sc://localhost").getOrCreate() + opts = ClientOptions(None) + args = Arguments.createArgumentsFromJSON(fileName="../examples/febrl/config.json", phase="peekModel") + zingg = Zingg(args=args, options=opts) + zingg.execute() diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py index 7e7075f47..f30f78240 100644 --- a/python/zingg_v2/client.py +++ b/python/zingg_v2/client.py @@ -160,10 +160,10 @@ class ClientOptions: def __init__(self, argsSent: Optional[Sequence[str]]) -> None: if argsSent is None: args = [] + self._opt_v2 = models_v2.ClientOptions() else: args = [a for a in argsSent] - - self._opt_v2 = models_v2.ClientOptions(**{k: v for k, v in zip(args[:-1], args[1:])}) + self._opt_v2 = models_v2.ClientOptions(**{k: v for k, v in zip(args[:-1], args[1:])}) print("arguments for client options are ", self._opt_v2.to_java_args()) def getClientOptions(self) -> str: diff --git a/python/zingg_v2/models.py b/python/zingg_v2/models.py index ce25eb13d..2b7e9519d 100644 --- a/python/zingg_v2/models.py +++ b/python/zingg_v2/models.py @@ -77,7 +77,7 @@ class Arguments(BaseModel): @field_validator("numPartitions") @classmethod def validate_num_partitions(cls, v: int) -> int: - if (v != -1) or (v <= 0): + if (v != -1) and (v <= 0): _err_msg = "Number of partitions can be greater than 0 for user specified partitioning or equal to -1 for system decided partitioning" raise ValidationError(_err_msg) @@ -130,4 +130,8 @@ class ClientOptions: remote: Optional[str] = None def to_java_args(self) -> list[str]: - return list(itertools.chain.from_iterable([[f"--{key}", value] for key, value in asdict(self) if value is not None])) + return list( + itertools.chain.from_iterable( + [[f"--{key}", value] for key, value in 
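# Editor's note on the two models.py fixes in this hunk: the validator change
# is a genuine bug fix. The old guard `(v != -1) or (v <= 0)` is true for
# every integer (v == -1 satisfies the right-hand side, anything else the
# left), so every numPartitions value raised; with `and`, only zero or
# negative values other than the -1 "system decides" sentinel are rejected.
# Likewise to_java_args must iterate asdict(self).items(): iterating the dict
# directly yields keys only, so the old (key, value) unpacking could never
# succeed.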
asdict(self).items() if value is not None] + ) + ) diff --git a/scripts/run-spark-connect-local.sh b/scripts/run-spark-connect-local.sh index a13b834fe..f259ebb6f 100644 --- a/scripts/run-spark-connect-local.sh +++ b/scripts/run-spark-connect-local.sh @@ -1,6 +1,8 @@ #!/usr/bin/bash spark-3.5.1-bin-hadoop3/sbin/start-connect-server.sh \ - --package org.apache.spark:spark-connect_2.12:3.5.1 \ - --jars zing_jar_path - --conf spark.connect.extensions.relation.classes=zingg.spark.connect.ZinggSparkConnectPlugin + --wait \ + --verbose \ + --jars assembly/target/zingg-0.4.0.jar \ + --conf spark.connect.extensions.relation.classes=zingg.spark.connect.ZinggSparkConnectPlugin \ + --packages org.apache.spark:spark-connect_2.12:3.5.1 diff --git a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java deleted file mode 100644 index 15ef4dde8..000000000 --- a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java +++ /dev/null @@ -1,76 +0,0 @@ -package zingg.spark.connect; - -import com.google.protobuf.Any; -import com.google.protobuf.InvalidProtocolBufferException; -import com.sun.tools.javac.util.List; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; -import org.apache.spark.sql.connect.planner.SparkConnectPlanner; -import org.apache.spark.sql.connect.plugin.RelationPlugin; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import scala.Option; -import zingg.common.client.ArgumentsUtil; -import zingg.common.client.ClientOptions; -import zingg.common.client.IArguments; -import zingg.common.client.ZinggClientException; -import zingg.spark.client.SparkClient; -import zingg.spark.connect.proto.SubmitZinggJob; - -class ZinggConnectPlugin implements RelationPlugin { - @Override - public Option transform(org.sparkproject.connect.protobuf.Any relation, SparkConnectPlanner planner) { - if (relation.is(SubmitZinggJob.class)) { - SubmitZinggJob message; - try { - message = relation.unpack(SubmitZinggJob.class); - } catch (InvalidProtocolBufferException e) { - // Should be unreachable due to explicit check of the message type - throw new RuntimeException(e); - } - String cliArgs = message.getArgs(); - String options = message.getOptions(); - // Parsing of options and arguments - ClientOptions clientOptions = new ClientOptions(cliArgs); - IArguments arguments; - try { - arguments = new ArgumentsUtil().createArgumentsFromJSONString( - options, - clientOptions.getOptionValue(ClientOptions.PHASE) - ); - } catch (ZinggClientException e) { - throw new RuntimeException(e); - } - // Get active session and create Zingg Client - try (SparkSession session = planner.sessionHolder().session()) { - SparkClient client = new SparkClient(arguments, clientOptions, session); - // TODO: How to capture logs to send them back to client? 
- - // Run the job - client.init(); - client.execute(); - client.postMetrics(); - - // Build an output DataFrame object - IArguments newArgs = client.getArguments(); - Dataset outputDf = session.createDataFrame( - List.of(RowFactory.create("SUCESS", new ArgumentsUtil().writeArgumentstoJSONString(newArgs))), - new StructType( - new StructField[]{ - DataTypes.createStructField("status", DataTypes.StringType, false), - DataTypes.createStructField("newArgs", DataTypes.StringType, false) - } - ) - ); - return Option.apply(outputDf.logicalPlan()); - } catch (ZinggClientException e) { - throw new RuntimeException(e); - } - } - return Option.empty(); - } -} diff --git a/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala b/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala new file mode 100644 index 000000000..3a0145faa --- /dev/null +++ b/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala @@ -0,0 +1,42 @@ +package zingg.spark.connect + +import com.google.protobuf +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connect.planner.SparkConnectPlanner +import org.apache.spark.sql.connect.plugin.RelationPlugin +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import zingg.common.client.{ArgumentsUtil, ClientOptions} +import zingg.spark.client.SparkClient +import zingg.spark.connect.proto.SubmitZinggJob + +import scala.collection.JavaConversions._ + +class ZinggConnectPlugin extends RelationPlugin { + override def transform(relation: protobuf.Any, planner: SparkConnectPlanner): Option[LogicalPlan] = { + if (!relation.is(classOf[SubmitZinggJob])) { + Option.empty + } else { + val message = relation.unpack(classOf[SubmitZinggJob]) + val spark = planner.sessionHolder.session + val options = new ClientOptions(message.getOptions) + val args = new ArgumentsUtil().createArgumentsFromJSONString(message.getArgs, options.getOptionValue(ClientOptions.PHASE)) + val client = new SparkClient(args, options, spark) + client.init() + client.execute() + client.postMetrics() + + val outDf = spark.createDataFrame( + Seq(Row("SUCEESS", new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments))), + StructType( + Seq( + StructField("status", StringType, nullable = false), + StructField("newArgs", StringType, nullable = false) + ) + ) + ) + + Option(outDf.queryExecution.logical) + } + } +} diff --git a/spark/pom.xml b/spark/pom.xml index 8d09195da..00174b047 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -54,8 +54,32 @@ graphframes graphframes ${graphframes.version} - +
+    <dependency>
+      <groupId>com.google.protobuf</groupId>
+      <artifactId>protobuf-java</artifactId>
+      <version>3.25.1</version>
+      <scope>compile</scope>
+    </dependency>
+ + + + + net.alchim31.maven + scala-maven-plugin + 4.8.1 + + + + compile + testCompile + + + + + + From 4c928d0f5ba9fd1cd3bd224956091d2087a39eb7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 23 Apr 2024 17:49:16 +0530 Subject: [PATCH 127/219] added null or blank and exact type to int, long and date --- .../common/core/feature/DateFeature.java | 10 +++++ .../zingg/common/core/feature/IntFeature.java | 10 +++++ .../common/core/feature/LongFeature.java | 8 ++++ .../function/CheckNullFunctionDate.java | 30 +++++++++++++ .../function/CheckNullFunctionInt.java | 30 +++++++++++++ .../function/CheckNullFunctionLong.java | 30 +++++++++++++ .../function/DateSimilarityFunctionExact.java | 23 ++++++++++ .../IntegerSimilarityFunctionExact.java | 21 +++++++++ .../function/LongSimilarityFunctionExact.java | 21 +++++++++ .../function/TestCheckNullFunctionDate.java | 36 ++++++++++++++++ .../function/TestCheckNullFunctionInt.java | 35 +++++++++++++++ .../function/TestCheckNullFunctionLong.java | 35 +++++++++++++++ .../TestDateSimilarityFunctionExact.java | 43 +++++++++++++++++++ .../TestIntegerSimilarityFunctionExact.java | 41 ++++++++++++++++++ .../TestLongSimilarityFunctionExact.java | 41 ++++++++++++++++++ .../configuration/field-definitions.md | 6 +-- 16 files changed, 417 insertions(+), 3 deletions(-) create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java create mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java create mode 100644 common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java diff --git a/common/core/src/main/java/zingg/common/core/feature/DateFeature.java b/common/core/src/main/java/zingg/common/core/feature/DateFeature.java index 7809c3b6f..c410d7881 100644 --- a/common/core/src/main/java/zingg/common/core/feature/DateFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/DateFeature.java @@ -4,10 +4,14 @@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; +import zingg.common.core.similarity.function.CheckNullFunctionDate; import zingg.common.core.similarity.function.DateSimilarityFunction; +import zingg.common.core.similarity.function.DateSimilarityFunctionExact; public class DateFeature extends BaseFeature { + private static final long serialVersionUID = 1L; + public DateFeature() { } @@ -28,6 +32,12 @@ public void init(FieldDefinition f) { if 
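/*
 * Editor's note: patch 127 wires EXACT and NULL_OR_BLANK into the integer,
 * long and date features, so those match types are no longer string-only.
 * In a Zingg field definition this would look roughly like the sketch below
 * (field name illustrative; treat the field-definitions doc updated at the
 * end of this patch as the authoritative format):
 *
 *   { "fieldName": "yearOfBirth", "dataType": "integer",
 *     "matchType": "exact,null_or_blank" }
 */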
(f.getMatchType().contains(MatchType.FUZZY)) { addSimFunction(new DateSimilarityFunction()); } + if (f.getMatchType().contains(MatchType.EXACT)) { + addSimFunction(new DateSimilarityFunctionExact()); + } + if (f.getMatchType().contains(MatchType.NULL_OR_BLANK)) { + addSimFunction(new CheckNullFunctionDate()); + } } } diff --git a/common/core/src/main/java/zingg/common/core/feature/IntFeature.java b/common/core/src/main/java/zingg/common/core/feature/IntFeature.java index a178ba5ea..3770f8d7d 100644 --- a/common/core/src/main/java/zingg/common/core/feature/IntFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/IntFeature.java @@ -2,9 +2,13 @@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; +import zingg.common.core.similarity.function.CheckNullFunctionInt; import zingg.common.core.similarity.function.IntegerSimilarityFunction; +import zingg.common.core.similarity.function.IntegerSimilarityFunctionExact; public class IntFeature extends BaseFeature { + private static final long serialVersionUID = 1L; + public IntFeature() { } @@ -14,6 +18,12 @@ public void init(FieldDefinition newParam) { if (newParam.getMatchType().contains(MatchType.FUZZY)) { addSimFunction(new IntegerSimilarityFunction()); } + if (newParam.getMatchType().contains(MatchType.EXACT)) { + addSimFunction(new IntegerSimilarityFunctionExact()); + } + if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) { + addSimFunction(new CheckNullFunctionInt()); + } } } diff --git a/common/core/src/main/java/zingg/common/core/feature/LongFeature.java b/common/core/src/main/java/zingg/common/core/feature/LongFeature.java index 8c3a3c5b4..6f5cba946 100644 --- a/common/core/src/main/java/zingg/common/core/feature/LongFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/LongFeature.java @@ -2,7 +2,9 @@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; +import zingg.common.core.similarity.function.CheckNullFunctionLong; import zingg.common.core.similarity.function.LongSimilarityFunction; +import zingg.common.core.similarity.function.LongSimilarityFunctionExact; public class LongFeature extends BaseFeature { private static final long serialVersionUID = 1L; @@ -16,6 +18,12 @@ public void init(FieldDefinition newParam) { if (newParam.getMatchType().contains(MatchType.FUZZY)) { addSimFunction(new LongSimilarityFunction()); } + if (newParam.getMatchType().contains(MatchType.EXACT)) { + addSimFunction(new LongSimilarityFunctionExact()); + } + if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) { + addSimFunction(new CheckNullFunctionLong()); + } } } diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java new file mode 100644 index 000000000..9d3db53b7 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java @@ -0,0 +1,30 @@ +package zingg.common.core.similarity.function; + +import java.util.Date; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class CheckNullFunctionDate extends SimFunction { + + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(CheckNullFunctionDate.class); + + public CheckNullFunctionDate() { + super("CheckNullFunctionDate"); + } + + public CheckNullFunctionDate(String name) { + super(name); + } + + @Override + public 
Double call(Date first, Date second) { + if (first != null && second != null) { + return 1d; + } + return 0d; + } + +} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java new file mode 100644 index 000000000..e6e27ddb8 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java @@ -0,0 +1,30 @@ +package zingg.common.core.similarity.function; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class CheckNullFunctionInt extends SimFunction { + + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(CheckNullFunctionInt.class); + + public CheckNullFunctionInt() { + super("CheckNullFunctionInt"); + } + + public CheckNullFunctionInt(String name) { + super(name); + } + + @Override + public Double call(Integer first, Integer second) { + if (first != null && second != null) { + return 1d; + } + return 0d; + } + + + +} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java new file mode 100644 index 000000000..41e753232 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java @@ -0,0 +1,30 @@ +package zingg.common.core.similarity.function; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class CheckNullFunctionLong extends SimFunction { + + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(CheckNullFunctionLong.class); + + public CheckNullFunctionLong() { + super("CheckNullFunctionLong"); + } + + public CheckNullFunctionLong(String name) { + super(name); + } + + @Override + public Double call(Long first, Long second) { + if (first != null && second != null) { + return 1d; + } + return 0d; + } + + + +} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java new file mode 100644 index 000000000..33447c6f8 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java @@ -0,0 +1,23 @@ +package zingg.common.core.similarity.function; + +import java.util.Date; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class DateSimilarityFunctionExact extends SimFunction { + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(DateSimilarityFunctionExact.class); + + public DateSimilarityFunctionExact() { + super("DateSimilarityFunctionExact"); + } + + @Override + public Double call(Date first, Date second) { + if (first == null || second == null) return 1d; + double score = first.equals(second) ? 
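/* Editor's note: returning 1d when either side is null is intentional across
 * these exact functions. It mirrors the documented default that Zingg treats
 * nulls as matches, and leaves null discrimination to the NULL_OR_BLANK
 * feature (the CheckNullFunction* classes above), which scores 1d only when
 * both values are present. */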
1d : 0d; + return score; + } +} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java new file mode 100644 index 000000000..dbd8df0d0 --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java @@ -0,0 +1,21 @@ +package zingg.common.core.similarity.function; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class IntegerSimilarityFunctionExact extends SimFunction { + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(IntegerSimilarityFunctionExact.class); + + public IntegerSimilarityFunctionExact() { + super("IntegerSimilarityFunctionExact"); + } + + @Override + public Double call(Integer first, Integer second) { + if (first == null || second == null) return 1d; + double score = first==second ? 1d : 0d; + return score; + } +} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java new file mode 100644 index 000000000..cf9c77d7a --- /dev/null +++ b/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java @@ -0,0 +1,21 @@ +package zingg.common.core.similarity.function; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class LongSimilarityFunctionExact extends SimFunction { + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory + .getLog(LongSimilarityFunctionExact.class); + + public LongSimilarityFunctionExact() { + super("LongSimilarityFunctionExact"); + } + + @Override + public Double call(Long first, Long second) { + if (first == null || second == null) return 1d; + double score = first==second ? 
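/* Editor's note: `first == second` here compares boxed Integer/Long
 * references, not values. The JVM is only required to cache small boxes, so
 * two Longs both holding, say, 1000 can score 0d; the unit tests below pass
 * only because they use small cached values (1, 2, 101, 102). The generic
 * equals()-based SimilarityFunctionExact of patch 129 later in this series
 * also fixes this:
 *
 *   Long a = 1000L, b = 1000L;
 *   System.out.println(a == b);      // typically false: distinct boxes
 *   System.out.println(a.equals(b)); // true: value comparison
 */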
1d : 0d; + return score; + } +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java new file mode 100644 index 000000000..6a98e2dad --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java @@ -0,0 +1,36 @@ +package zingg.common.core.similarity.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Date; + +import org.junit.jupiter.api.Test; +public class TestCheckNullFunctionDate { + + + @Test + public void testFirstNull() { + CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + assertEquals(0d, isNull.call(null, new Date(2))); + } + + + @Test + public void testSecondNull() { + CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + assertEquals(0d, isNull.call(new Date(1), null)); + } + + @Test + public void testBothNull() { + CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + assertEquals(0d, isNull.call(null, null)); + } + + @Test + public void testBothNotNull() { + CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + assertEquals(1d, isNull.call(new Date(1), new Date(2))); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java new file mode 100644 index 000000000..18ec9825c --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java @@ -0,0 +1,35 @@ +package zingg.common.core.similarity.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +public class TestCheckNullFunctionInt { + + + @Test + public void testFirstNull() { + CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + assertEquals(0d, isNull.call(null, 2)); + } + + + @Test + public void testSecondNull() { + CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + assertEquals(0d, isNull.call(1, null)); + } + + @Test + public void testBothNull() { + CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + assertEquals(0d, isNull.call(null, null)); + } + + @Test + public void testBothNotNull() { + CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + assertEquals(1d, isNull.call(1, 2)); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java new file mode 100644 index 000000000..b61923e81 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java @@ -0,0 +1,35 @@ +package zingg.common.core.similarity.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +public class TestCheckNullFunctionLong { + + + @Test + public void testFirstNull() { + CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + assertEquals(0d, isNull.call(null, 2l)); + } + + + @Test + public void testSecondNull() { + CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + assertEquals(0d, isNull.call(1l, null)); + } + + @Test + public void testBothNull() { + CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + assertEquals(0d, isNull.call(null, null)); + } + + @Test + public void testBothNotNull() { + 
CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + assertEquals(1d, isNull.call(1l, 2l)); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java new file mode 100644 index 000000000..e42831e1e --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java @@ -0,0 +1,43 @@ +package zingg.common.core.similarity.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Date; + +import org.junit.jupiter.api.Test; + +public class TestDateSimilarityFunctionExact { + + + @Test + public void testFirstNull() { + DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, new Date(2))); + } + + + @Test + public void testSecondNull() { + DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + assertEquals(1d, exact.call(new Date(1), null)); + } + + @Test + public void testBothNull() { + DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, null)); + } + + @Test + public void testNotEqual() { + DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + assertEquals(0d, exact.call(new Date(101), new Date(102))); + } + + @Test + public void testEqual() { + DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + assertEquals(1d, exact.call(new Date(101), new Date(101))); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java new file mode 100644 index 000000000..5fd391a5f --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java @@ -0,0 +1,41 @@ +package zingg.common.core.similarity.function; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +public class TestIntegerSimilarityFunctionExact { + + + @Test + public void testFirstNull() { + IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, 2)); + } + + + @Test + public void testSecondNull() { + IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + assertEquals(1d, exact.call(1, null)); + } + + @Test + public void testBothNull() { + IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, null)); + } + + @Test + public void testNotEqual() { + IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + assertEquals(0d, exact.call(101, 102)); + } + + @Test + public void testEqual() { + IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + assertEquals(1d, exact.call(101, 101)); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java new file mode 100644 index 000000000..f44be57f8 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java @@ -0,0 +1,41 @@ +package zingg.common.core.similarity.function; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +public class TestLongSimilarityFunctionExact { + + + @Test + public void testFirstNull() { + LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, 2l)); + } + + + @Test + public void testSecondNull() { + LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + assertEquals(1d, exact.call(1l, null)); + } + + @Test + public void testBothNull() { + LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + assertEquals(1d, exact.call(null, null)); + } + + @Test + public void testNotEqual() { + LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + assertEquals(0d, exact.call(101l, 102l)); + } + + @Test + public void testEqual() { + LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + assertEquals(1d, exact.call(101l, 101l)); + } + +} diff --git a/docs/stepbystep/configuration/field-definitions.md b/docs/stepbystep/configuration/field-definitions.md index d2bd5c291..6c0376983 100644 --- a/docs/stepbystep/configuration/field-definitions.md +++ b/docs/stepbystep/configuration/field-definitions.md @@ -32,12 +32,12 @@ Type of the column - string, integer, double, etc. | Match Type | Description | Can be applied to | | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- | -| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, double, date | -| EXACT | No tolerance with variations, Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string | +| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, long, double, date | +| EXACT | No tolerance with variations, Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string, integer, long, date | | DONT\_USE | Appears in the output but no computation is done on these. Helpful for fields like ids that are required in the output. DONT\_USE fields are not shown to the user while labeling, if [showConcise](field-definitions.md#showconcise) is set to true. | any | | EMAIL | Matches only the id part of the email before the @ character | any | | PINCODE | Matches pin codes like xxxxx-xxxx with xxxxx | string | -| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field which has other match type like FUZZY, Zingg will build a feature for null values and learn | string | +| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field which has other match type like FUZZY, Zingg will build a feature for null values and learn | string, integer, long, date | | TEXT | Compares words overlap between two strings. Good for descriptive fields without much typos | string | | NUMERIC | extracts numbers from strings and compares how many of them are same across both strings, for example apartment numbers. 
| string | | NUMERIC\_WITH\_UNITS | extracts product codes or numbers with units, for example 16gb from strings and compares how many are same across both strings | string | From 0f04c6c1c0f58878cb633c3454366324949ad2a2 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 24 Apr 2024 11:19:58 +0530 Subject: [PATCH 128/219] aws-s3 coming blank on site and amazons3 not coming , so combining the docs --- docs/connectors/amazons3.md | 25 ------------------------- docs/connectors/aws-s3.md | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 25 deletions(-) delete mode 100644 docs/connectors/amazons3.md diff --git a/docs/connectors/amazons3.md b/docs/connectors/amazons3.md deleted file mode 100644 index 5ee47236f..000000000 --- a/docs/connectors/amazons3.md +++ /dev/null @@ -1,25 +0,0 @@ -# S3 - -1. Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg - -2. Create aws access key and export via env vars (ensure that the user with below keys has read/write access to above): - -export AWS_ACCESS_KEY_ID= -export AWS_SECRET_ACCESS_KEY= - -(if mfa is enabled AWS_SESSION_TOKEN env var would also be needed ) - -3. Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven - -4. Set above in zingg.conf : -spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar - -5. Run using: - - ./scripts/zingg.sh --phase findTrainingData --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg - ./scripts/zingg.sh --phase label --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg - ./scripts/zingg.sh --phase train --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg - ./scripts/zingg.sh --phase match --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg - -6. Models etc. would get saved in -Amazon S3 > Buckets > zingg28032023 >zingg > 100 diff --git a/docs/connectors/aws-s3.md b/docs/connectors/aws-s3.md index f4be12b78..b263139bc 100644 --- a/docs/connectors/aws-s3.md +++ b/docs/connectors/aws-s3.md @@ -1,2 +1,25 @@ # AWS S3 +1. Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg + +2. Create aws access key and export via env vars (ensure that the user with below keys has read/write access to above): + +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= + +(if mfa is enabled AWS_SESSION_TOKEN env var would also be needed ) + +3. Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven + +4. Set above in zingg.conf : +spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar + +5. Run using: + + ./scripts/zingg.sh --phase findTrainingData --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg + ./scripts/zingg.sh --phase label --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg + ./scripts/zingg.sh --phase train --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg + ./scripts/zingg.sh --phase match --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg + +6. Models etc. 
would get saved in +Amazon S3 > Buckets > zingg28032023 >zingg > 100 From ba3503f5461712249093e3249c9a4ba3a59fd2c2 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 24 Apr 2024 14:33:29 +0530 Subject: [PATCH 129/219] consolidating the classes using template class --- .../common/core/feature/DateFeature.java | 8 ++--- .../zingg/common/core/feature/IntFeature.java | 8 ++--- .../common/core/feature/LongFeature.java | 8 ++--- ...nctionLong.java => CheckNullFunction.java} | 12 +++----- .../function/CheckNullFunctionDate.java | 30 ------------------- .../function/CheckNullFunctionInt.java | 30 ------------------- .../IntegerSimilarityFunctionExact.java | 21 ------------- .../function/LongSimilarityFunctionExact.java | 21 ------------- ...xact.java => SimilarityFunctionExact.java} | 12 ++++---- .../function/TestCheckNullFunctionDate.java | 8 ++--- .../function/TestCheckNullFunctionInt.java | 8 ++--- .../function/TestCheckNullFunctionLong.java | 8 ++--- .../TestDateSimilarityFunctionExact.java | 10 +++---- .../TestIntegerSimilarityFunctionExact.java | 10 +++---- .../TestLongSimilarityFunctionExact.java | 10 +++---- 15 files changed, 48 insertions(+), 156 deletions(-) rename common/core/src/main/java/zingg/common/core/similarity/function/{CheckNullFunctionLong.java => CheckNullFunction.java} (55%) delete mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java delete mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java delete mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java delete mode 100644 common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java rename common/core/src/main/java/zingg/common/core/similarity/function/{DateSimilarityFunctionExact.java => SimilarityFunctionExact.java} (57%) diff --git a/common/core/src/main/java/zingg/common/core/feature/DateFeature.java b/common/core/src/main/java/zingg/common/core/feature/DateFeature.java index c410d7881..230d81972 100644 --- a/common/core/src/main/java/zingg/common/core/feature/DateFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/DateFeature.java @@ -4,9 +4,9 @@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; -import zingg.common.core.similarity.function.CheckNullFunctionDate; +import zingg.common.core.similarity.function.CheckNullFunction; import zingg.common.core.similarity.function.DateSimilarityFunction; -import zingg.common.core.similarity.function.DateSimilarityFunctionExact; +import zingg.common.core.similarity.function.SimilarityFunctionExact; public class DateFeature extends BaseFeature { @@ -33,10 +33,10 @@ public void init(FieldDefinition f) { addSimFunction(new DateSimilarityFunction()); } if (f.getMatchType().contains(MatchType.EXACT)) { - addSimFunction(new DateSimilarityFunctionExact()); + addSimFunction(new SimilarityFunctionExact("DateSimilarityFunctionExact")); } if (f.getMatchType().contains(MatchType.NULL_OR_BLANK)) { - addSimFunction(new CheckNullFunctionDate()); + addSimFunction(new CheckNullFunction("CheckNullFunctionDate")); } } diff --git a/common/core/src/main/java/zingg/common/core/feature/IntFeature.java b/common/core/src/main/java/zingg/common/core/feature/IntFeature.java index 3770f8d7d..a28fa2833 100644 --- a/common/core/src/main/java/zingg/common/core/feature/IntFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/IntFeature.java @@ -2,9 +2,9 
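/*
 * Editor's note: patch 129 collapses six per-type classes into two generic
 * templates, recovering the old behaviour purely through the type parameter
 * and a display name. Call sites become one-liners like the following,
 * mirroring the DateFeature hunk above (the type parameters were presumably
 * <Date>, <Integer>, <Long> in the original source and are flattened away in
 * this rendering of the diff):
 *
 *   addSimFunction(new SimilarityFunctionExact<Date>("DateSimilarityFunctionExact"));
 *   addSimFunction(new CheckNullFunction<Date>("CheckNullFunctionDate"));
 */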
@@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; -import zingg.common.core.similarity.function.CheckNullFunctionInt; +import zingg.common.core.similarity.function.CheckNullFunction; import zingg.common.core.similarity.function.IntegerSimilarityFunction; -import zingg.common.core.similarity.function.IntegerSimilarityFunctionExact; +import zingg.common.core.similarity.function.SimilarityFunctionExact; public class IntFeature extends BaseFeature { private static final long serialVersionUID = 1L; @@ -19,10 +19,10 @@ public void init(FieldDefinition newParam) { addSimFunction(new IntegerSimilarityFunction()); } if (newParam.getMatchType().contains(MatchType.EXACT)) { - addSimFunction(new IntegerSimilarityFunctionExact()); + addSimFunction(new SimilarityFunctionExact("IntegerSimilarityFunctionExact")); } if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) { - addSimFunction(new CheckNullFunctionInt()); + addSimFunction(new CheckNullFunction("CheckNullFunctionInt")); } } diff --git a/common/core/src/main/java/zingg/common/core/feature/LongFeature.java b/common/core/src/main/java/zingg/common/core/feature/LongFeature.java index 6f5cba946..81bf7261a 100644 --- a/common/core/src/main/java/zingg/common/core/feature/LongFeature.java +++ b/common/core/src/main/java/zingg/common/core/feature/LongFeature.java @@ -2,9 +2,9 @@ import zingg.common.client.FieldDefinition; import zingg.common.client.MatchType; -import zingg.common.core.similarity.function.CheckNullFunctionLong; +import zingg.common.core.similarity.function.CheckNullFunction; import zingg.common.core.similarity.function.LongSimilarityFunction; -import zingg.common.core.similarity.function.LongSimilarityFunctionExact; +import zingg.common.core.similarity.function.SimilarityFunctionExact; public class LongFeature extends BaseFeature { private static final long serialVersionUID = 1L; @@ -19,10 +19,10 @@ public void init(FieldDefinition newParam) { addSimFunction(new LongSimilarityFunction()); } if (newParam.getMatchType().contains(MatchType.EXACT)) { - addSimFunction(new LongSimilarityFunctionExact()); + addSimFunction(new SimilarityFunctionExact("LongSimilarityFunctionExact")); } if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) { - addSimFunction(new CheckNullFunctionLong()); + addSimFunction(new CheckNullFunction("CheckNullFunctionLong")); } } diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunction.java similarity index 55% rename from common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java rename to common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunction.java index 41e753232..9a5ffc7f4 100644 --- a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionLong.java +++ b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunction.java @@ -3,22 +3,18 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -public class CheckNullFunctionLong extends SimFunction { +public class CheckNullFunction extends SimFunction { private static final long serialVersionUID = 1L; public static final Log LOG = LogFactory - .getLog(CheckNullFunctionLong.class); + .getLog(CheckNullFunction.class); - public CheckNullFunctionLong() { - super("CheckNullFunctionLong"); - } - - public CheckNullFunctionLong(String name) { + public 
CheckNullFunction(String name) { super(name); } @Override - public Double call(Long first, Long second) { + public Double call(T first, T second) { if (first != null && second != null) { return 1d; } diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java deleted file mode 100644 index 9d3db53b7..000000000 --- a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionDate.java +++ /dev/null @@ -1,30 +0,0 @@ -package zingg.common.core.similarity.function; - -import java.util.Date; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -public class CheckNullFunctionDate extends SimFunction { - - private static final long serialVersionUID = 1L; - public static final Log LOG = LogFactory - .getLog(CheckNullFunctionDate.class); - - public CheckNullFunctionDate() { - super("CheckNullFunctionDate"); - } - - public CheckNullFunctionDate(String name) { - super(name); - } - - @Override - public Double call(Date first, Date second) { - if (first != null && second != null) { - return 1d; - } - return 0d; - } - -} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java b/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java deleted file mode 100644 index e6e27ddb8..000000000 --- a/common/core/src/main/java/zingg/common/core/similarity/function/CheckNullFunctionInt.java +++ /dev/null @@ -1,30 +0,0 @@ -package zingg.common.core.similarity.function; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -public class CheckNullFunctionInt extends SimFunction { - - private static final long serialVersionUID = 1L; - public static final Log LOG = LogFactory - .getLog(CheckNullFunctionInt.class); - - public CheckNullFunctionInt() { - super("CheckNullFunctionInt"); - } - - public CheckNullFunctionInt(String name) { - super(name); - } - - @Override - public Double call(Integer first, Integer second) { - if (first != null && second != null) { - return 1d; - } - return 0d; - } - - - -} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java deleted file mode 100644 index dbd8df0d0..000000000 --- a/common/core/src/main/java/zingg/common/core/similarity/function/IntegerSimilarityFunctionExact.java +++ /dev/null @@ -1,21 +0,0 @@ -package zingg.common.core.similarity.function; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -public class IntegerSimilarityFunctionExact extends SimFunction { - private static final long serialVersionUID = 1L; - public static final Log LOG = LogFactory - .getLog(IntegerSimilarityFunctionExact.class); - - public IntegerSimilarityFunctionExact() { - super("IntegerSimilarityFunctionExact"); - } - - @Override - public Double call(Integer first, Integer second) { - if (first == null || second == null) return 1d; - double score = first==second ? 
1d : 0d; - return score; - } -} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java deleted file mode 100644 index cf9c77d7a..000000000 --- a/common/core/src/main/java/zingg/common/core/similarity/function/LongSimilarityFunctionExact.java +++ /dev/null @@ -1,21 +0,0 @@ -package zingg.common.core.similarity.function; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -public class LongSimilarityFunctionExact extends SimFunction { - private static final long serialVersionUID = 1L; - public static final Log LOG = LogFactory - .getLog(LongSimilarityFunctionExact.class); - - public LongSimilarityFunctionExact() { - super("LongSimilarityFunctionExact"); - } - - @Override - public Double call(Long first, Long second) { - if (first == null || second == null) return 1d; - double score = first==second ? 1d : 0d; - return score; - } -} diff --git a/common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java b/common/core/src/main/java/zingg/common/core/similarity/function/SimilarityFunctionExact.java similarity index 57% rename from common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java rename to common/core/src/main/java/zingg/common/core/similarity/function/SimilarityFunctionExact.java index 33447c6f8..af1100eec 100644 --- a/common/core/src/main/java/zingg/common/core/similarity/function/DateSimilarityFunctionExact.java +++ b/common/core/src/main/java/zingg/common/core/similarity/function/SimilarityFunctionExact.java @@ -1,21 +1,19 @@ package zingg.common.core.similarity.function; -import java.util.Date; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -public class DateSimilarityFunctionExact extends SimFunction { +public class SimilarityFunctionExact extends SimFunction { private static final long serialVersionUID = 1L; public static final Log LOG = LogFactory - .getLog(DateSimilarityFunctionExact.class); + .getLog(SimilarityFunctionExact.class); - public DateSimilarityFunctionExact() { - super("DateSimilarityFunctionExact"); + public SimilarityFunctionExact(String name) { + super(name); } @Override - public Double call(Date first, Date second) { + public Double call(T first, T second) { if (first == null || second == null) return 1d; double score = first.equals(second) ? 
1d : 0d; return score; diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java index 6a98e2dad..2f80631d6 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java @@ -10,26 +10,26 @@ public class TestCheckNullFunctionDate { @Test public void testFirstNull() { - CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); assertEquals(0d, isNull.call(null, new Date(2))); } @Test public void testSecondNull() { - CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); assertEquals(0d, isNull.call(new Date(1), null)); } @Test public void testBothNull() { - CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); assertEquals(0d, isNull.call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunctionDate isNull = new CheckNullFunctionDate(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); assertEquals(1d, isNull.call(new Date(1), new Date(2))); } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java index 18ec9825c..229edcb7a 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java @@ -9,26 +9,26 @@ public class TestCheckNullFunctionInt { @Test public void testFirstNull() { - CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); assertEquals(0d, isNull.call(null, 2)); } @Test public void testSecondNull() { - CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); assertEquals(0d, isNull.call(1, null)); } @Test public void testBothNull() { - CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); assertEquals(0d, isNull.call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunctionInt isNull = new CheckNullFunctionInt(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); assertEquals(1d, isNull.call(1, 2)); } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java index b61923e81..31d9cb59c 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java @@ -9,26 +9,26 @@ public class TestCheckNullFunctionLong { @Test public void testFirstNull() { - CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); assertEquals(0d, isNull.call(null, 2l)); } @Test public void testSecondNull() { - 
CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); assertEquals(0d, isNull.call(1l, null)); } @Test public void testBothNull() { - CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); assertEquals(0d, isNull.call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunctionLong isNull = new CheckNullFunctionLong(); + CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); assertEquals(1d, isNull.call(1l, 2l)); } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java index e42831e1e..1916fafa7 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java @@ -11,32 +11,32 @@ public class TestDateSimilarityFunctionExact { @Test public void testFirstNull() { - DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); assertEquals(1d, exact.call(null, new Date(2))); } @Test public void testSecondNull() { - DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); assertEquals(1d, exact.call(new Date(1), null)); } @Test public void testBothNull() { - DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); assertEquals(1d, exact.call(null, null)); } @Test public void testNotEqual() { - DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); assertEquals(0d, exact.call(new Date(101), new Date(102))); } @Test public void testEqual() { - DateSimilarityFunctionExact exact = new DateSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); assertEquals(1d, exact.call(new Date(101), new Date(101))); } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java index 5fd391a5f..d89314046 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java @@ -9,32 +9,32 @@ public class TestIntegerSimilarityFunctionExact { @Test public void testFirstNull() { - IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); assertEquals(1d, exact.call(null, 2)); } @Test public void testSecondNull() { - IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); assertEquals(1d, exact.call(1, null)); } @Test public void testBothNull() { - IntegerSimilarityFunctionExact exact 
= new IntegerSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); assertEquals(1d, exact.call(null, null)); } @Test public void testNotEqual() { - IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); assertEquals(0d, exact.call(101, 102)); } @Test public void testEqual() { - IntegerSimilarityFunctionExact exact = new IntegerSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); assertEquals(1d, exact.call(101, 101)); } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java index f44be57f8..0d8576caf 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java @@ -9,32 +9,32 @@ public class TestLongSimilarityFunctionExact { @Test public void testFirstNull() { - LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); assertEquals(1d, exact.call(null, 2l)); } @Test public void testSecondNull() { - LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); assertEquals(1d, exact.call(1l, null)); } @Test public void testBothNull() { - LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); assertEquals(1d, exact.call(null, null)); } @Test public void testNotEqual() { - LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); assertEquals(0d, exact.call(101l, 102l)); } @Test public void testEqual() { - LongSimilarityFunctionExact exact = new LongSimilarityFunctionExact(); + SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); assertEquals(1d, exact.call(101l, 101l)); } From ce80e072f7885fc10c07687495a8a7f9aa198d50 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 25 Apr 2024 08:44:37 +0530 Subject: [PATCH 130/219] refactor to common method --- .../function/TestCheckNullFunctionDate.java | 18 +++++++--------- .../function/TestCheckNullFunctionInt.java | 17 +++++++-------- .../function/TestCheckNullFunctionLong.java | 17 +++++++-------- .../TestDateSimilarityFunctionExact.java | 19 ++++++++--------- .../TestIntegerSimilarityFunctionExact.java | 21 ++++++++----------- .../TestLongSimilarityFunctionExact.java | 20 ++++++++---------- 6 files changed, 51 insertions(+), 61 deletions(-) diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java index 2f80631d6..c886b76ed 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionDate.java @@ -7,30 +7,28 @@ import 
org.junit.jupiter.api.Test; public class TestCheckNullFunctionDate { - @Test public void testFirstNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); - assertEquals(0d, isNull.call(null, new Date(2))); + assertEquals(0d, simFunc().call(null, new Date(2))); } - @Test public void testSecondNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); - assertEquals(0d, isNull.call(new Date(1), null)); + assertEquals(0d, simFunc().call(new Date(1), null)); } @Test public void testBothNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); - assertEquals(0d, isNull.call(null, null)); + assertEquals(0d, simFunc().call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionDate"); - assertEquals(1d, isNull.call(new Date(1), new Date(2))); + assertEquals(1d, simFunc().call(new Date(1), new Date(2))); + } + + protected CheckNullFunction simFunc() { + return new CheckNullFunction("CheckNullFunctionDate"); } } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java index 229edcb7a..144fc5fa6 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionInt.java @@ -9,27 +9,26 @@ public class TestCheckNullFunctionInt { @Test public void testFirstNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); - assertEquals(0d, isNull.call(null, 2)); + assertEquals(0d, simFunc().call(null, 2)); } - @Test public void testSecondNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); - assertEquals(0d, isNull.call(1, null)); + assertEquals(0d, simFunc().call(1, null)); } @Test public void testBothNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); - assertEquals(0d, isNull.call(null, null)); + assertEquals(0d, simFunc().call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionInt"); - assertEquals(1d, isNull.call(1, 2)); + assertEquals(1d, simFunc().call(1, 2)); + } + + protected CheckNullFunction simFunc() { + return new CheckNullFunction("CheckNullFunctionInt"); } } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java index 31d9cb59c..a7712d074 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestCheckNullFunctionLong.java @@ -9,27 +9,26 @@ public class TestCheckNullFunctionLong { @Test public void testFirstNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); - assertEquals(0d, isNull.call(null, 2l)); + assertEquals(0d, simFunc().call(null, 2l)); } - @Test public void testSecondNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); - assertEquals(0d, isNull.call(1l, null)); + assertEquals(0d, simFunc().call(1l, null)); } @Test public void testBothNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); - assertEquals(0d, isNull.call(null, null)); + 
assertEquals(0d, simFunc().call(null, null)); } @Test public void testBothNotNull() { - CheckNullFunction isNull = new CheckNullFunction("CheckNullFunctionLong"); - assertEquals(1d, isNull.call(1l, 2l)); + assertEquals(1d, simFunc().call(1l, 2l)); + } + + protected CheckNullFunction simFunc() { + return new CheckNullFunction("CheckNullFunctionLong"); } } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java index 1916fafa7..56f815291 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestDateSimilarityFunctionExact.java @@ -11,33 +11,32 @@ public class TestDateSimilarityFunctionExact { @Test public void testFirstNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, new Date(2))); + assertEquals(1d, simFunc().call(null, new Date(2))); } @Test public void testSecondNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); - assertEquals(1d, exact.call(new Date(1), null)); + assertEquals(1d, simFunc().call(new Date(1), null)); } @Test public void testBothNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, null)); + assertEquals(1d, simFunc().call(null, null)); } @Test public void testNotEqual() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); - assertEquals(0d, exact.call(new Date(101), new Date(102))); + assertEquals(0d, simFunc().call(new Date(101), new Date(102))); } @Test public void testEqual() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("DateSimilarityFunctionExact"); - assertEquals(1d, exact.call(new Date(101), new Date(101))); + assertEquals(1d, simFunc().call(new Date(101), new Date(101))); + } + + protected SimilarityFunctionExact simFunc() { + return new SimilarityFunctionExact("DateSimilarityFunctionExact"); } } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java index d89314046..37e1b415b 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestIntegerSimilarityFunctionExact.java @@ -6,36 +6,33 @@ public class TestIntegerSimilarityFunctionExact { - @Test public void testFirstNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, 2)); + assertEquals(1d, simFunc().call(null, 2)); } - @Test public void testSecondNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); - assertEquals(1d, exact.call(1, null)); + assertEquals(1d, simFunc().call(1, null)); } @Test public void testBothNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, null)); + assertEquals(1d, simFunc().call(null, null)); } @Test public void testNotEqual() { - SimilarityFunctionExact exact = new 
SimilarityFunctionExact("IntegerSimilarityFunctionExact"); - assertEquals(0d, exact.call(101, 102)); + assertEquals(0d, simFunc().call(101, 102)); } @Test public void testEqual() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); - assertEquals(1d, exact.call(101, 101)); + assertEquals(1d, simFunc().call(101, 101)); + } + + protected SimilarityFunctionExact simFunc() { + return new SimilarityFunctionExact("IntegerSimilarityFunctionExact"); } } diff --git a/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java index 0d8576caf..ee8808259 100644 --- a/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java +++ b/common/core/src/test/java/zingg/common/core/similarity/function/TestLongSimilarityFunctionExact.java @@ -9,33 +9,31 @@ public class TestLongSimilarityFunctionExact { @Test public void testFirstNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, 2l)); + assertEquals(1d, simFunc().call(null, 2l)); } - @Test public void testSecondNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); - assertEquals(1d, exact.call(1l, null)); + assertEquals(1d, simFunc().call(1l, null)); } @Test public void testBothNull() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); - assertEquals(1d, exact.call(null, null)); + assertEquals(1d, simFunc().call(null, null)); } @Test public void testNotEqual() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); - assertEquals(0d, exact.call(101l, 102l)); + assertEquals(0d, simFunc().call(101l, 102l)); } @Test public void testEqual() { - SimilarityFunctionExact exact = new SimilarityFunctionExact("LongSimilarityFunctionExact"); - assertEquals(1d, exact.call(101l, 101l)); + assertEquals(1d, simFunc().call(101l, 101l)); } + protected SimilarityFunctionExact simFunc() { + return new SimilarityFunctionExact("LongSimilarityFunctionExact"); + } + } From 7b4e7811719c11ab6f3236fa0b60aafcdbcc8599 Mon Sep 17 00:00:00 2001 From: SemyonSinchenko Date: Wed, 1 May 2024 16:03:44 +0200 Subject: [PATCH 131/219] Semi-working version of Connect On branch main Your branch is up to date with 'origin/main'. 
Changes to be committed: modified: .gitignore modified: python/requirements.txt modified: python/test_spark_connect.py modified: python/zingg_v2/client.py modified: python/zingg_v2/errors.py modified: scripts/run-spark-connect-local.sh modified: spark/client/pom.xml modified: spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala modified: spark/pom.xml --- .gitignore | 10 +++++++ python/requirements.txt | 3 ++- python/test_spark_connect.py | 1 + python/zingg_v2/client.py | 27 ++++++++++++------- python/zingg_v2/errors.py | 3 +++ scripts/run-spark-connect-local.sh | 5 ++-- spark/client/pom.xml | 13 +++++++ .../spark/connect/ZinggConnectPlugin.scala | 1 + spark/pom.xml | 20 -------------- 9 files changed, 49 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index c75e61ded..e7f285da9 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,19 @@ python/docs/_build/_doctrees # Helix stuff .helix +# Emacs stuff +.dir-locals.el + # JDTLS stuff .package .classpath .project .settings .factorypath + +# Metals LSP +.metals +.bloop + +# Hadoop & Spark binaries +spark-* \ No newline at end of file diff --git a/python/requirements.txt b/python/requirements.txt index 0d786b720..42c95be15 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,4 +3,5 @@ seaborn matplotlib sphinx sphinx-rtd-theme -pyspark>=3.5 +pyspark[connect]>=3.5 +pydantic diff --git a/python/test_spark_connect.py b/python/test_spark_connect.py index 6221d4bc3..1fdf00606 100644 --- a/python/test_spark_connect.py +++ b/python/test_spark_connect.py @@ -4,6 +4,7 @@ if __name__ == "__main__": spark = SparkSession.builder.remote("sc://localhost").getOrCreate() + print(hasattr(spark, "_jvm")) opts = ClientOptions(None) args = Arguments.createArgumentsFromJSON(fileName="../examples/febrl/config.json", phase="peekModel") zingg = Zingg(args=args, options=opts) diff --git a/python/zingg_v2/client.py b/python/zingg_v2/client.py index f30f78240..32edda143 100644 --- a/python/zingg_v2/client.py +++ b/python/zingg_v2/client.py @@ -12,7 +12,7 @@ from zingg_v2 import models as models_v2 from zingg_v2.connect import ZinggJob -from zingg_v2.errors import ZinggArgumentsValidationError +from zingg_v2.errors import ZinggArgumentsValidationError, ZinggSparkConnectEmptySession from zingg_v2.pipes import Pipe @@ -20,13 +20,19 @@ class Zingg: def __init__(self, args: Arguments, options: ClientOptions) -> None: self.args = args self.options = options - self.spark: Union[SparkSession, ConnectSession] = SparkSession.getActiveSession() - - if self.spark is None: - _warn_msg = "Spark Session is not initialized in the current thread!" - _warn_msg += " It is strongly reccomend to init SparkSession manually!" - warnings.warn(_warn_msg) - self.spark = SparkSession.builder.getOrCreate() + if os.environ["ZINGG_SPARK_CONNECT"]: + self.spark = ConnectSession.getActiveSession() + if self.spark is None: + _err_msg = "SparkConnect mode was chosen but the Spark session was not created!" + _err_msg += "\nYou have to initialize SparkConnectSession before creating Zingg!" + raise ZinggSparkConnectEmptySession(_err_msg) + else: + self.spark = SparkSession.getActiveSession() + if self.spark is None: + _warn_msg = "Spark Session is not initialized in the current thread!" + _warn_msg += " It is strongly recommended to init SparkSession manually!"
+ warnings.warn(_warn_msg) + self.spark = SparkSession.builder.getOrCreate() def execute(self) -> Zingg: # TODO: implement it @@ -37,9 +43,9 @@ def execute(self) -> Zingg: # java_job_definition is JSON definition of Zingg Job java_job_definition = self.args.writeArgumentsToJSONString() - spark_connect = hasattr(self.spark, "_jvm") + spark_connect = not hasattr(self.spark, "_jvm") - if not spark_connect: + if spark_connect: _log_msg = "Submitting a Zingg Job\n" _log_msg += f"Arguments: {java_args}\n\n" _log_msg += java_job_definition @@ -53,6 +59,7 @@ def execute(self) -> Zingg: new_args: str = output["newArgs"] else: + # There are errors that should be fixed! :TODO # TODO: Put that logic into Java by creating an entry point for Python API? j_options = self.spark._jvm.zingg.common.client.ClientOptions(java_args) j_args = self.spark._jvm.zingg.common.client.ArgumentsUtil.createArgumentsFromJSONString( diff --git a/python/zingg_v2/errors.py b/python/zingg_v2/errors.py index dc0658031..4ba7794ff 100644 --- a/python/zingg_v2/errors.py +++ b/python/zingg_v2/errors.py @@ -3,3 +3,6 @@ class ZinggArgumentsValidationError(ValueError): class ZinggParameterIsNotSet(ValueError): pass + +class ZinggSparkConnectEmptySession(ValueError): + pass diff --git a/scripts/run-spark-connect-local.sh b/scripts/run-spark-connect-local.sh index f259ebb6f..87572659a 100644 --- a/scripts/run-spark-connect-local.sh +++ b/scripts/run-spark-connect-local.sh @@ -1,8 +1,7 @@ #!/usr/bin/bash - spark-3.5.1-bin-hadoop3/sbin/start-connect-server.sh \ - --wait \ + spark-3.5.1-bin-hadoop3/sbin/start-connect-server.sh --wait \ --verbose \ --jars assembly/target/zingg-0.4.0.jar \ - --conf spark.connect.extensions.relation.classes=zingg.spark.connect.ZinggSparkConnectPlugin \ + --conf spark.connect.extensions.relation.classes=zingg.spark.connect.ZinggConnectPlugin \ --packages org.apache.spark:spark-connect_2.12:3.5.1 diff --git a/spark/client/pom.xml b/spark/client/pom.xml index 418abd6e8..19dda6ba6 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -51,6 +51,19 @@ ${basedir}/src/main/java/zingg/client + + net.alchim31.maven + scala-maven-plugin + 4.8.1 + + + + compile + testCompile + + + + diff --git a/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala b/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala index 3a0145faa..4f55a94db 100644 --- a/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala +++ b/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala @@ -6,6 +6,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.connect.planner.SparkConnectPlanner import org.apache.spark.sql.connect.plugin.RelationPlugin import org.apache.spark.sql.types.{StringType, StructField, StructType} + import zingg.common.client.{ArgumentsUtil, ClientOptions} import zingg.spark.client.SparkClient import zingg.spark.connect.proto.SubmitZinggJob diff --git a/spark/pom.xml b/spark/pom.xml index 00174b047..95f480484 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -62,24 +62,4 @@ compile - - - - - net.alchim31.maven - scala-maven-plugin - 4.8.1 - - - - compile - testCompile - - - - - - - - From 0ebdc846fbfb0b97f07c1ee2ded4ac80720a6ca0 Mon Sep 17 00:00:00 2001 From: semyonsinchenko Date: Sun, 5 May 2024 13:08:43 +0200 Subject: [PATCH 132/219] Switch to 4.0 + drop scala from spark-client + rewrite plugin in java + update to scala 2.13 and corresponding fixes + small changes On branch main Your branch is up to date with 
'origin/main'. Changes to be committed: modified: pom.xml modified: spark/client/pom.xml new file: spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java deleted: spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala modified: spark/core/src/main/java/zingg/spark/core/block/SparkBlockFunction.java modified: spark/core/src/test/java/zingg/TestUDFDoubleWrappedArr.java --- pom.xml | 25 +++++++- spark/client/pom.xml | 13 ---- .../spark/connect/ZinggConnectPlugin.java | 64 +++++++++++++++++++ .../spark/connect/ZinggConnectPlugin.scala | 43 ------------- .../spark/core/block/SparkBlockFunction.java | 4 +- .../java/zingg/TestUDFDoubleWrappedArr.java | 6 +- 6 files changed, 92 insertions(+), 63 deletions(-) create mode 100644 spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java delete mode 100644 spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala diff --git a/pom.xml b/pom.xml index d4370e1e2..111119f49 100644 --- a/pom.xml +++ b/pom.xml @@ -48,7 +48,7 @@ spark-3.5 - true + false spark 3.5 @@ -62,6 +62,23 @@ 0.8.3-spark3.5-s_2.12
+ + spark-4.0 + + true + + spark + 4.0 + + + + 4.0.0-SNAPSHOT + 2.13.13 + 4.0 + 2.13 + 0.8.3-spark3.5-s_2.13 + + 0.4.0 @@ -89,7 +106,11 @@ SparkPackagesRepo https://repos.spark-packages.org/ - + + + Apache Snapshots + https://repository.apache.org/snapshots/ + diff --git a/spark/client/pom.xml b/spark/client/pom.xml index 19dda6ba6..418abd6e8 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -51,19 +51,6 @@ ${basedir}/src/main/java/zingg/client - - net.alchim31.maven - scala-maven-plugin - 4.8.1 - - - - compile - testCompile - - - - diff --git a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java new file mode 100644 index 000000000..e65f1941f --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java @@ -0,0 +1,64 @@ +package zingg.spark.connect; + +import com.google.protobuf.Any; +import com.google.protobuf.InvalidProtocolBufferException; +import com.sun.tools.javac.util.List; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.connect.planner.SparkConnectPlanner; +import org.apache.spark.sql.connect.plugin.RelationPlugin; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import zingg.common.client.*; +import zingg.spark.client.SparkClient; +import zingg.spark.connect.proto.SubmitZinggJob; + +import java.util.Optional; + +public class ZinggConnectPlugin implements RelationPlugin { + @Override + public Optional transform(byte[] bytes, SparkConnectPlanner sparkConnectPlanner) { + Any command; + try { + command = Any.parseFrom(bytes); + if (!command.is(SubmitZinggJob.class)) { + return Optional.empty(); + } else { + try (SparkSession session = sparkConnectPlanner.sessionHolder().session()) { + SubmitZinggJob request = command.unpack(SubmitZinggJob.class); + String options = request.getOptions(); + String args = request.getArgs(); + ClientOptions clientOptions = new ClientOptions(options); + IArguments arguments = new ArgumentsUtil() + .createArgumentsFromJSONString(args, clientOptions.getOptionValue(ClientOptions.PHASE)); + SparkClient client = new SparkClient(arguments, clientOptions, session); + client.init(); + client.execute(); + client.postMetrics(); + + Dataset outDF = session.createDataFrame( + List.of( + RowFactory.create( + "SUCCESS", + new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments()) + ) + ), + new StructType(new StructField[]{ + DataTypes.createStructField("status", DataTypes.StringType, false), + DataTypes.createStructField("newArgs", DataTypes.StringType, false) + }) + ); + return Optional.of(outDF.logicalPlan()); + } + } + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException("Protobuf exception in SparkConnect", e); + } catch (ZinggClientException e) { + throw new RuntimeException("Zingg Internal Error", e); + } + } +} diff --git a/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala b/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala deleted file mode 100644 index 4f55a94db..000000000 --- a/spark/client/src/main/scala/zingg/spark/connect/ZinggConnectPlugin.scala +++ /dev/null @@ -1,43 +0,0 @@ -package zingg.spark.connect - -import com.google.protobuf -import 
org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.connect.planner.SparkConnectPlanner -import org.apache.spark.sql.connect.plugin.RelationPlugin -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -import zingg.common.client.{ArgumentsUtil, ClientOptions} -import zingg.spark.client.SparkClient -import zingg.spark.connect.proto.SubmitZinggJob - -import scala.collection.JavaConversions._ - -class ZinggConnectPlugin extends RelationPlugin { - override def transform(relation: protobuf.Any, planner: SparkConnectPlanner): Option[LogicalPlan] = { - if (!relation.is(classOf[SubmitZinggJob])) { - Option.empty - } else { - val message = relation.unpack(classOf[SubmitZinggJob]) - val spark = planner.sessionHolder.session - val options = new ClientOptions(message.getOptions) - val args = new ArgumentsUtil().createArgumentsFromJSONString(message.getArgs, options.getOptionValue(ClientOptions.PHASE)) - val client = new SparkClient(args, options, spark) - client.init() - client.execute() - client.postMetrics() - - val outDf = spark.createDataFrame( - Seq(Row("SUCEESS", new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments))), - StructType( - Seq( - StructField("status", StringType, nullable = false), - StructField("newArgs", StringType, nullable = false) - ) - ) - ) - - Option(outDf.queryExecution.logical) - } - } -} diff --git a/spark/core/src/main/java/zingg/spark/core/block/SparkBlockFunction.java b/spark/core/src/main/java/zingg/spark/core/block/SparkBlockFunction.java index 0e0dce39e..c80a0348c 100644 --- a/spark/core/src/main/java/zingg/spark/core/block/SparkBlockFunction.java +++ b/spark/core/src/main/java/zingg/spark/core/block/SparkBlockFunction.java @@ -7,7 +7,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import scala.collection.JavaConversions; +import scala.jdk.CollectionConverters; import scala.collection.Seq; import zingg.common.core.block.BlockFunction; import zingg.common.core.block.Canopy; @@ -23,7 +23,7 @@ public SparkBlockFunction(Tree> tree) { @Override public List getListFromRow(Row r) { Seq sObj = r.toSeq(); - List seqList = JavaConversions.seqAsJavaList(sObj); + List seqList = CollectionConverters.SeqHasAsJava(sObj).asJava(); //the abstract list returned here does not support adding a new element, //so an ugly way is to create a new list altogether (!!) //see in perf - maybe just iterate over all the row elements and add the last one? 
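A note on the collection-conversion change in SparkBlockFunction above: scala.collection.JavaConversions was removed in Scala 2.13, so Java callers must go through the explicit wrappers in scala.jdk.CollectionConverters instead. Below is a minimal, self-contained sketch of that idiom; it is not code from the patch, and the class name and sample values are made up for illustration.

```java
import java.util.ArrayList;
import java.util.List;

import scala.collection.mutable.Buffer;
import scala.jdk.CollectionConverters;

public class SeqConversionSketch {
    public static void main(String[] args) {
        // Java -> Scala: wrap a java.util.List as a Scala Buffer (a Seq).
        List<String> javaList = new ArrayList<>(List.of("a", "b", "c"));
        Buffer<String> scalaSeq = CollectionConverters.ListHasAsScala(javaList).asScala();

        // Scala -> Java: the direction getListFromRow uses on the Seq
        // returned by Row.toSeq().
        List<String> backToJava = CollectionConverters.SeqHasAsJava(scalaSeq).asJava();
        System.out.println(backToJava); // prints [a, b, c]
    }
}
```

The Java list returned by asJava is a view that does not support adding elements, which is why the patched getListFromRow still copies it into a fresh list before appending.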
diff --git a/spark/core/src/test/java/zingg/TestUDFDoubleWrappedArr.java b/spark/core/src/test/java/zingg/TestUDFDoubleWrappedArr.java index 345362fb1..cd19368ee 100644 --- a/spark/core/src/test/java/zingg/TestUDFDoubleWrappedArr.java +++ b/spark/core/src/test/java/zingg/TestUDFDoubleWrappedArr.java @@ -2,15 +2,15 @@ import org.apache.spark.sql.api.java.UDF2; -import scala.collection.mutable.WrappedArray; +import scala.collection.mutable.ArraySeq; import zingg.common.core.similarity.function.ArrayDoubleSimilarityFunction; -public class TestUDFDoubleWrappedArr implements UDF2,WrappedArray, Double>{ +public class TestUDFDoubleWrappedArr implements UDF2,ArraySeq, Double>{ private static final long serialVersionUID = 1L; @Override - public Double call(WrappedArray t1, WrappedArray t2) throws Exception { + public Double call(ArraySeq t1, ArraySeq t2) throws Exception { System.out.println("TestUDFDoubleWrappedArr class" +t1.getClass()); Double[] t1Arr = new Double[t1.length()]; From 1410583a7e638a33347d061651cbba5767df51a7 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Tue, 7 May 2024 16:46:04 +0530 Subject: [PATCH 133/219] remove unwanted checked exceptions --- .../zingg/common/client/event/listeners/EventsListener.java | 5 ++--- .../zingg/common/client/event/listeners/IEventListener.java | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java index 42fccc7f9..df4bd73a6 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/EventsListener.java @@ -2,7 +2,6 @@ import java.util.List; -import zingg.common.client.ZinggClientException; import zingg.common.client.event.events.IEvent; import zingg.common.client.util.ListMap; @@ -22,11 +21,11 @@ public void addListener(Class eventClass, IEventListener liste eventListenersList.add(eventClass.getCanonicalName(), listener); } - public void fireEvent(IEvent event) throws ZinggClientException { + public void fireEvent(IEvent event) { listen(event); } - private void listen(IEvent event) throws ZinggClientException { + private void listen(IEvent event) { Class eventClass = event.getClass(); List listenerList = eventListenersList.get(eventClass.getCanonicalName()); if (listenerList != null) { diff --git a/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java index 756eea766..5f45e5082 100644 --- a/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java +++ b/common/client/src/main/java/zingg/common/client/event/listeners/IEventListener.java @@ -1,11 +1,10 @@ package zingg.common.client.event.listeners; -import zingg.common.client.ZinggClientException; import zingg.common.client.event.events.IEvent; public class IEventListener { - public void listen(IEvent event) throws ZinggClientException { + public void listen(IEvent event) { } } From 96159e4106c9d09e85a37cbb264add789e08d193 Mon Sep 17 00:00:00 2001 From: Sonal Date: Mon, 3 Jun 2024 09:46:20 +0530 Subject: [PATCH 134/219] Update faq.md with cdp and mdm information --- docs/faq.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 1663a624f..9f30aaa9e 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -22,11 +22,13 @@ Very much! 
Zingg uses Spark and ML under the hood so that you don't have to worry ## Is Zingg an MDM? -No, Zingg is not an MDM. An MDM is the system of record, it has its own store where linked and mastered records are saved. Zingg enables MDM but is not a system of record. You can build an MDM in a data store of your choice using Zingg however. +An MDM is the system of record: it has its own store where linked and mastered records are saved. Zingg Community Version is not a complete MDM, but it can be used to build an MDM. You can build an MDM in a data store of your choice using Zingg Community Version. Zingg Enterprise Version is a lakehouse/warehouse native MDM. ## Is Zingg a CDP ? -No, Zingg is not a CDP, as it does not stream events or customer data through different channels. Zingg does overlap with the CDPs identity resolution and building customer 360 views. Here is an [article](https://hightouch.com/blog/warehouse-identity-resolution/) describing how you can build your own CDP on the warehouse with Zingg. +No, Zingg is not a CDP, as it does not stream events or customer data through different channels. However, if you want to base your customer platform on your warehouse or datalake, Zingg is a great fit. You can leverage existing ETL, observability and other tools which are part of your data stack and use Zingg for identity. +Zingg Community Version can be used to build a composable CDP by resolving identities natively on the warehouse and datalake and building customer 360 views. Zingg's identity resolution is far more powerful than what is offered by any out of the box CDP. +Zingg Enterprise's probabilistic and deterministic matching takes this even further. Here is an [article](https://hightouch.com/blog/warehouse-identity-resolution/) describing how you can build your own CDP on the warehouse with Zingg. ## I can do Entity Resolution using a graph database like TigerGraph/Neo4J, why do I need Zingg ? 
From 4399f4aba59cea6840d0912a522c25dc33f918ba Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Fri, 21 Jun 2024 00:25:23 +0530 Subject: [PATCH 135/219] updated jdk to 11 to stop the issue with codegen --- log4j2.properties | 4 ++++ pom.xml | 4 ++-- spark/pom.xml | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/log4j2.properties b/log4j2.properties index 6a7dbc16a..f007411fd 100644 --- a/log4j2.properties +++ b/log4j2.properties @@ -49,3 +49,7 @@ logger.zingg.name = zingg logger.zingg.level = info logger.zingg_analytics.name = zingg.common.core.util.Analytics logger.zingg_analytics.level = off +logger.codegen.name = org.apache.spark.sql.catalyst.expressions +logger.codegen.level = OFF +logger.codehaus.name = org.codehaus +logger.codehaus.level = OFF diff --git a/pom.xml b/pom.xml index cd421cc5f..c63f6ffcc 100644 --- a/pom.xml +++ b/pom.xml @@ -67,8 +67,8 @@ 0.4.1-SNAPSHOT false false - 8 - 8 + 11 + 11 UTF-8 2.10 2.5.2 diff --git a/spark/pom.xml b/spark/pom.xml index 2ea784073..e070f5c6e 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -42,6 +42,12 @@ graphframes graphframes ${graphframes.version} + + + org.slf4j + slf4j-api + + From 673e63edb77e8e40b6d01f620761d107b1df88f8 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 22 Jun 2024 08:04:23 +0530 Subject: [PATCH 136/219] repartition to take multiple columns --- common/client/src/main/java/zingg/common/client/ZFrame.java | 2 +- spark/client/src/main/java/zingg/spark/client/SparkFrame.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index 6ffa25a5a..11f0b0d89 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -80,7 +80,7 @@ public interface ZFrame { public ZFrame repartition(int num); - public ZFrame repartition(int num, C c); + public ZFrame repartition(int num, C... c); public ZFrame sample(boolean repartition, float num); public ZFrame sample(boolean repartition, double num); diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index ce20ea9a5..e3d8896bf 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -215,7 +215,7 @@ public ZFrame, Row, Column> repartition(int nul){ return new SparkFrame(df.repartition(nul)); } - public ZFrame, Row, Column> repartition(int nul, Column c){ + public ZFrame, Row, Column> repartition(int nul, Column... c){ return new SparkFrame(df.repartition(nul, c)); } From 5288e84759de01d5e28e56686b2153f79dbf1c8b Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Sat, 22 Jun 2024 16:14:54 +0530 Subject: [PATCH 137/219] repartition with seq of cols --- common/client/src/main/java/zingg/common/client/ZFrame.java | 4 +++- .../client/src/main/java/zingg/spark/client/SparkFrame.java | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index 11f0b0d89..37de2b0db 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -80,7 +80,9 @@ public interface ZFrame { public ZFrame repartition(int num); - public ZFrame repartition(int num, C... 
c); + public ZFrame repartition(int num, C c); + public ZFrame repartition(int num,scala.collection.Seq partitionExprs); + public ZFrame sample(boolean repartition, float num); public ZFrame sample(boolean repartition, double num); diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index e3d8896bf..c2984868a 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -215,10 +215,14 @@ public ZFrame, Row, Column> repartition(int nul){ return new SparkFrame(df.repartition(nul)); } - public ZFrame, Row, Column> repartition(int nul, Column... c){ + public ZFrame, Row, Column> repartition(int nul, Column c){ return new SparkFrame(df.repartition(nul, c)); } + public ZFrame, Row, Column> repartition(int num,scala.collection.Seq partitionExprs){ + return new SparkFrame(df.repartition(num, partitionExprs)); + } + @Override public Column gt(String c) { return gt(this,c); From ba27988d308f3178865d49c343381f004201de09 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 1 Jul 2024 20:44:00 +0530 Subject: [PATCH 138/219] removed redundant method and made pair class pluggable --- .../main/java/zingg/common/core/executor/Linker.java | 11 +++++------ .../main/java/zingg/common/core/executor/Matcher.java | 4 ---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/executor/Linker.java b/common/core/src/main/java/zingg/common/core/executor/Linker.java index d63526410..c271a2161 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Linker.java +++ b/common/core/src/main/java/zingg/common/core/executor/Linker.java @@ -27,17 +27,16 @@ public ZFrame selectColsFromBlocked(ZFrame blocked) { return blocked; } - @Override - public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception{ - return getPairs(blocked, bAll, new SelfPairBuilderSourceSensitive (getDSUtil(),args)); - } - @Override protected ZFrame getActualDupes(ZFrame blocked, ZFrame testData) throws Exception, ZinggClientException{ PredictionFilter predictionFilter = new PredictionFilter(); - SelfPairBuilderSourceSensitive iPairBuilder = new SelfPairBuilderSourceSensitive (getDSUtil(),args); + SelfPairBuilderSourceSensitive iPairBuilder = getPairBuilderSourceSensitive(); return getActualDupes(blocked, testData,predictionFilter, iPairBuilder, null); } + + protected SelfPairBuilderSourceSensitive getPairBuilderSourceSensitive() { + return new SelfPairBuilderSourceSensitive (getDSUtil(),args); + } @Override public void writeOutput(ZFrame sampleOrginal, ZFrame dupes) throws ZinggClientException { diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 483059c4d..2e976eae9 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -53,10 +53,6 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin return blocked1; } - public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws Exception{ - return getPairs(blocked, bAll, new SelfPairBuilder (getDSUtil(),args)); - } - public ZFrame getPairs(ZFrameblocked, ZFramebAll, IPairBuilder iPairBuilder) throws Exception{ return iPairBuilder.getPairs(blocked, bAll); } From 2ae6867d23ad85668c46f051f2369885f2fe8875 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Tue, 2 Jul 2024 
18:22:02 +0530 Subject: [PATCH 139/219] Update README.md docker /tmp mapping to user's machine /tmp --- docs/stepbystep/installation/docker/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/stepbystep/installation/docker/README.md b/docs/stepbystep/installation/docker/README.md index 4cc395341..afce0e9ac 100644 --- a/docs/stepbystep/installation/docker/README.md +++ b/docs/stepbystep/installation/docker/README.md @@ -12,6 +12,10 @@ The easiest way to get started is to pull the Docker image with the last release docker pull zingg/zingg:0.4.1-SNAPSHOT docker run -it zingg/zingg:0.4.1-SNAPSHOT bash ``` +In case of a permission denied error, try mapping the container's /tmp to your machine's /tmp +``` +docker run -v /tmp:/tmp -it zingg/zingg:0.4.0 bash +``` To know more about Docker, please refer to the official [docker documentation](https://docs.docker.com/). From d9754396187779ed3104a5873b13b4443eac38ca Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Tue, 2 Jul 2024 18:40:52 +0530 Subject: [PATCH 140/219] Update match.md view match results in console --- docs/setup/match.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/setup/match.md b/docs/setup/match.md index 050d95fa4..aedadd5b3 100644 --- a/docs/setup/match.md +++ b/docs/setup/match.md @@ -14,4 +14,14 @@ As can be seen in the image below, matching records are given the same z_cluster ![Match results](/assets/match.gif) -If records across multiple sources have to be matched, the [link phase](./link.md) should be used. \ No newline at end of file +The match results in CSV fromat will be saved in /tmp/zinggOutput. +To view top 100 match results sorted by z_cluster in console, use +``` +$ pyspark +z_oss = spark.read.option("header", True).csv('/tmp/zinggOutput') +z_oss = z_oss.sort('z_cluster') +z_oss.count() +z_oss.show(100) +``` + +If records across multiple sources have to be matched, the [link phase](./link.md) should be used. From 422b9bb76b101b92ef9f376f7d2e293bc02adb79 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Tue, 2 Jul 2024 18:42:14 +0530 Subject: [PATCH 141/219] Update match.md view match results in console --- docs/setup/match.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/setup/match.md b/docs/setup/match.md index aedadd5b3..6087b33a2 100644 --- a/docs/setup/match.md +++ b/docs/setup/match.md @@ -14,7 +14,7 @@ As can be seen in the image below, matching records are given the same z_cluster ![Match results](/assets/match.gif) -The match results in CSV fromat will be saved in /tmp/zinggOutput. +The match results in CSV format will be saved in /tmp/zinggOutput. To view top 100 match results sorted by z_cluster in console, use ``` $ pyspark From 6aa153140856e8a0ca0a0ec67295ade3ffb9fce9 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Tue, 2 Jul 2024 18:49:23 +0530 Subject: [PATCH 142/219] Update match.md --- docs/setup/match.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/setup/match.md b/docs/setup/match.md index 6087b33a2..e1ac81311 100644 --- a/docs/setup/match.md +++ b/docs/setup/match.md @@ -14,7 +14,7 @@ As can be seen in the image below, matching records are given the same z_cluster ![Match results](/assets/match.gif) -The match results in CSV format will be saved in /tmp/zinggOutput. +The match results in CSV format will be saved in '/tmp/zinggOutput'. 
To view top 100 match results sorted by z_cluster in console, use ``` $ pyspark From 248b6985d691d0b5dc5f4a8c33583925225688dd Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 15:54:32 +0530 Subject: [PATCH 143/219] Update settingUpZingg.md updated env variables part --- docs/settingUpZingg.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md index 9f1f3982a..e87f9de90 100644 --- a/docs/settingUpZingg.md +++ b/docs/settingUpZingg.md @@ -92,9 +92,11 @@ vim ~/.bashrc export SPARK_HOME=/opt/spark export SPARK_MASTER=local[\*] export MAVEN_HOME=/home/ubuntu/apache-maven-3.8.8 -export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$MAVEN_HOME/bin -export ZINGG_HOME=/zingg/assembly/target -export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +export ZINGG_HOME=/home/administrator/Zingg/zingg-Nitish/assembly/target +export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$JAVA_HOME/bin + +Skip adding export MAVEN_HOME=/home/ubuntu/apache-maven-3.8.8, if multiple maven version are not required Save/exit and do source .bashrc so that they reflect From f1d502461330d18c6c19afe76d1dedb2f224c6b4 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 15:57:03 +0530 Subject: [PATCH 144/219] Update settingUpZingg.md updated jdk 1.8 to jdk 11 --- docs/settingUpZingg.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md index e87f9de90..18da3e08b 100644 --- a/docs/settingUpZingg.md +++ b/docs/settingUpZingg.md @@ -35,13 +35,13 @@ _**Note :-**_ It is suggested to fork the repository to your account and then cl **** -_**Step 2 : Install JDK 1.8 (Java Development Kit)**_ +_**Step 2 : Install JDK 11 (Java Development Kit)**_ -* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java8 JDK1.8 in Ubuntu. +* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java11 JDK11 in Ubuntu. 
* For example: ``` -sudo apt install openjdk-8-jdk openjdk-8-jre +sudo apt install openjdk-11-jdk openjdk-11-jre javac -version java -version ``` From 61d13c579b19ef81caa3cf109f273cba40468f40 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 15:59:53 +0530 Subject: [PATCH 145/219] Update settingUpZingg.md zingg repo path --- docs/settingUpZingg.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md index 18da3e08b..3508f4158 100644 --- a/docs/settingUpZingg.md +++ b/docs/settingUpZingg.md @@ -92,11 +92,12 @@ vim ~/.bashrc export SPARK_HOME=/opt/spark export SPARK_MASTER=local[\*] export MAVEN_HOME=/home/ubuntu/apache-maven-3.8.8 -export ZINGG_HOME=/home/administrator/Zingg/zingg-Nitish/assembly/target +export ZINGG_HOME=<zingg repo path>/assembly/target export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$JAVA_HOME/bin +<zingg repo path> - is a path where zingg repo has been cloned +**Skip adding export MAVEN_HOME=/home/ubuntu/apache-maven-3.8.8 if multiple Maven versions are not required Save/exit and do source .bashrc so that they reflect From f5d96d79d340cdfb3d66ffc1672f5dc0362f7806 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 16:05:21 +0530 Subject: [PATCH 146/219] Update settingUpZingg.md added comments --- docs/settingUpZingg.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md index 3508f4158..2df515aad 100644 --- a/docs/settingUpZingg.md +++ b/docs/settingUpZingg.md @@ -79,6 +79,11 @@ rm -rf apache-maven-3.8.8-bin.tar.gz cd apache-maven-3.8.8/ cd bin ./mvn --version + +Make sure that mvn --version displays the correct Java version as well (Java 11) +Apache Maven 3.8.7 +Maven home: /usr/share/maven +Java version: 11.0.23, vendor: Ubuntu, runtime: /usr/lib/jvm/java-11-openjdk-amd64 ``` **** From a4b1ed18814609c4c3a9bd03698f03488c5af624 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 16:17:30 +0530 Subject: [PATCH 147/219] Update match.md reverted back --- docs/setup/match.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/setup/match.md b/docs/setup/match.md index e1ac81311..0e1fd35d7 100644 --- a/docs/setup/match.md +++ b/docs/setup/match.md @@ -14,14 +14,4 @@ As can be seen in the image below, matching records are given the same z_cluster ![Match results](/assets/match.gif) -The match results in CSV format will be saved in '/tmp/zinggOutput'. -To view top 100 match results sorted by z_cluster in console, use -``` -$ pyspark -z_oss = spark.read.option("header", True).csv('/tmp/zinggOutput') -z_oss = z_oss.sort('z_cluster') -z_oss.count() -z_oss.show(100) -``` - If records across multiple sources have to be matched, the [link phase](./link.md) should be used.
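Since the preceding setup patches rely on several environment variables being exported correctly, a quick sanity check is a tiny throwaway class like the one below. This helper is illustrative only and not part of the Zingg repo; the variable names are the ones docs/settingUpZingg.md asks you to export.

```java
import java.util.List;

// Illustrative helper, not part of the Zingg codebase: prints the variables
// that docs/settingUpZingg.md asks you to export in ~/.bashrc.
public class EnvCheckSketch {
    public static void main(String[] args) {
        for (String name : List.of("JAVA_HOME", "SPARK_HOME", "SPARK_MASTER", "ZINGG_HOME")) {
            String value = System.getenv(name);
            System.out.printf("%-12s = %s%n", name, value == null ? "<not set>" : value);
        }
    }
}
```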
From 8f6ebeefd69bf606dc7ac7b237ff1fc3c0a87aa8 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 17:01:38 +0530 Subject: [PATCH 148/219] Update README.md updated java version --- docs/stepbystep/installation/installing-from-release/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/stepbystep/installation/installing-from-release/README.md b/docs/stepbystep/installation/installing-from-release/README.md index 66180845a..41df9e958 100644 --- a/docs/stepbystep/installation/installing-from-release/README.md +++ b/docs/stepbystep/installation/installing-from-release/README.md @@ -8,7 +8,7 @@ Zingg is prebuilt for common Spark versions so that you can use those directly. ## Prerequisites -A) Java JDK - version "1.8.0\_131" +A) Java JDK - version "11.0.23" B) Apache Spark - version spark-3.5.0-bin-hadoop3 From 95e1e7ac4d2943abc63e2ee9b8aa22478bb16aa4 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Wed, 3 Jul 2024 17:05:24 +0530 Subject: [PATCH 149/219] Update compiling-from-source.md updated jdk 1.8 to jdk 11 --- docs/stepbystep/installation/compiling-from-source.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/stepbystep/installation/compiling-from-source.md b/docs/stepbystep/installation/compiling-from-source.md index 1de4f32fa..d59f15e3b 100644 --- a/docs/stepbystep/installation/compiling-from-source.md +++ b/docs/stepbystep/installation/compiling-from-source.md @@ -7,7 +7,7 @@ description: For a different Spark version or compiling latest code If you need to compile the latest code or build for a different Spark version, you can clone this repo and * Install maven -* Install JDK 1.8 +* Install JDK 11 * Set JAVA\_HOME to JDK base directory * Run the following: `mvn initialize` and then `mvn clean compile package` From ef1b7d0787ce58f9591c0c3bd3f09215ed2a437b Mon Sep 17 00:00:00 2001 From: administrator Date: Fri, 5 Jul 2024 12:13:52 +0530 Subject: [PATCH 150/219] renamed --- .../src/test/java/zingg/client/TestSparkFrame.java | 9 +-------- .../{TestSparkFrameBase.java => TestZFrameBase.java} | 6 +++--- 2 files changed, 4 insertions(+), 11 deletions(-) rename spark/client/src/test/java/zingg/client/{TestSparkFrameBase.java => TestZFrameBase.java} (97%) diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java index 3cfb3adce..8793abb3d 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java @@ -4,7 +4,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.Arrays; -import java.util.Date; import java.util.List; import org.apache.commons.logging.Log; @@ -12,21 +11,15 @@ import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.functions; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; import scala.collection.JavaConverters; import zingg.common.client.ZFrame; -import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; import zingg.spark.client.SparkFrame; import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestSparkFrame extends TestSparkFrameBase { +public class TestSparkFrame extends TestZFrameBase { public 
static final Log LOG = LogFactory.getLog(TestSparkFrame.class);
 public static final String NEW_COLUMN = "newColumn";
diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java
similarity index 97%
rename from spark/client/src/test/java/zingg/client/TestSparkFrameBase.java
rename to spark/client/src/test/java/zingg/client/TestZFrameBase.java
index dcc75bd95..4944f45a8 100644
--- a/spark/client/src/test/java/zingg/client/TestSparkFrameBase.java
+++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java
@@ -25,13 +25,13 @@
 import zingg.common.client.util.ColName;
 import zingg.spark.client.SparkFrame;
 
-public class TestSparkFrameBase {
+public class TestZFrameBase {
 
     public static IArguments args;
     public static JavaSparkContext ctx;
     public static SparkSession spark;
 
-    public static final Log LOG = LogFactory.getLog(TestSparkFrameBase.class);
+    public static final Log LOG = LogFactory.getLog(TestZFrameBase.class);
 
     public static final String STR_RECID = "recid";
     public static final String STR_GIVENNAME = "givenname";
@@ -53,7 +53,7 @@ protected static void setUpSpark() {
                     .appName("Zingg" + "Junit")
                     .getOrCreate();
             ctx = new JavaSparkContext(spark.sparkContext());
-            JavaSparkContext.jarOfClass(TestSparkFrameBase.class);
+            JavaSparkContext.jarOfClass(TestZFrameBase.class);
             args = new Arguments();
         } catch (Throwable e) {
             if (LOG.isDebugEnabled())

From f724e35f35b32722587f40089994a9dfdb64cc9e Mon Sep 17 00:00:00 2001
From: administrator
Date: Fri, 5 Jul 2024 15:16:58 +0530
Subject: [PATCH 151/219] refactored tests

---
 .../common/client/util/DFObjectUtil.java      |  25 +
 .../client/util/PojoToArrayConverter.java     |  21 +
 .../client/util/StructTypeFromPojoClass.java  |  30 +
 .../spark/client/util/RowsFromObjectList.java |  18 +
 .../spark/client/util/SparkDFObjectUtil.java  |  34 +
 .../util/SparkStructTypeFromPojoClass.java    |  48 ++
 .../java/zingg/client/TestSparkFrame.java     | 350 ++------
 .../java/zingg/client/TestZFrameBase.java     | 755 ++++++++++++++----
 8 files changed, 838 insertions(+), 443 deletions(-)
 create mode 100644 common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
 create mode 100644 common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
 create mode 100644 common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java
 create mode 100644 spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java
 create mode 100644 spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
 create mode 100644 spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java

diff --git a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
new file mode 100644
index 000000000..e8a190a4d
--- /dev/null
+++ b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
@@ -0,0 +1,25 @@
+package zingg.common.client.util;
+
+import java.util.List;
+
+import zingg.common.client.ZFrame;
+
+public abstract class DFObjectUtil<S, D, R, C> {
+
+    S session;
+
+    public DFObjectUtil(S s) {
+        this.session = s;
+    }
+
+    public S getSession() {
+        return this.session;
+    }
+
+    public void setSession(S session) {
+        this.session = session;
+    }
+
+    public abstract <T> ZFrame<D, R, C> getDFFromObjectList(List<T> objList, Class<T> objClass) throws Exception;
+
+}
diff --git a/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
new file mode 100644
index 000000000..5e7928521
--- /dev/null
+++ b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
@@ -0,0 +1,21 @@
+package zingg.common.client.util;
+
+import java.lang.reflect.Field;
+
+public class PojoToArrayConverter {
+
+    public static Object[] getObjectArray(Object person) throws IllegalAccessException {
+        Field[] fields = person.getClass().getDeclaredFields();
+        int fieldCount = fields.length;
+        Object[] objArr = new Object[fieldCount];
+
+        for (int i = 0; i < objArr.length; i++) {
+            Field field = fields[i];
+            field.setAccessible(true);
+
+            objArr[i] = field.get(person);
+        }
+
+        return objArr;
+    }
+}
diff --git a/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java b/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java
new file mode 100644
index 000000000..47c3620b8
--- /dev/null
+++ b/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java
@@ -0,0 +1,30 @@
+package zingg.common.client.util;
+
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.List;
+
+public abstract class StructTypeFromPojoClass<ST, SF, T> {
+
+    public abstract ST getStructType(Class<?> objClass) throws Exception;
+
+    public List<SF> getFields(Class<?> objClass) {
+        List<SF> structFields = new ArrayList<SF>();
+        if (objClass.getSuperclass() != null) {
+            Field[] fieldsSuper = objClass.getSuperclass().getDeclaredFields();
+            for (Field f : fieldsSuper) {
+                structFields.add(getStructField(f));
+            }
+        }
+        Field[] fields = objClass.getDeclaredFields();
+        for (Field f : fields) {
+            structFields.add(getStructField(f));
+        }
+        return structFields;
+    }
+
+    public abstract SF getStructField(Field field);
+
+    public abstract T getSFType(Class<?> t);
+
+}
diff --git a/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java b/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java
new file mode 100644
index 000000000..cb1a635be
--- /dev/null
+++ b/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java
@@ -0,0 +1,18 @@
+package zingg.spark.client.util;
+
+import java.util.List;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+
+import zingg.common.client.util.PojoToArrayConverter;
+
+public class RowsFromObjectList {
+
+    public static <T> Row[] getRows(List<T> t) throws Exception{
+        Row[] rows = new Row[t.size()];
+        for (int i=0; i < t.size(); ++i){
+            rows[i] = RowFactory.create(PojoToArrayConverter.getObjectArray(t.get(i)));
+        }
+        return rows;
+    }
+}
diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
new file mode 100644
index 000000000..1187fe533
--- /dev/null
+++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
@@ -0,0 +1,34 @@
+package zingg.spark.client.util;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.StructType;
+
+import zingg.common.client.ZFrame;
+import zingg.common.client.util.DFObjectUtil;
+import zingg.spark.client.SparkFrame;
+
+public class SparkDFObjectUtil extends DFObjectUtil<SparkSession, Dataset<Row>, Row, Column> {
+
+    public SparkDFObjectUtil(SparkSession s) {
+        super(s);
+    }
+
+    @Override
+    public <T> ZFrame<Dataset<Row>, Row, Column> getDFFromObjectList(List<T> objList, Class<T> objClass) throws Exception {
+        if(objList==null || objClass==null) return null;
+
+        SparkStructTypeFromPojoClass stpc = new SparkStructTypeFromPojoClass();
+
+        List<Row> rows = Arrays.asList(RowsFromObjectList.getRows(objList));
+        StructType structType = stpc.getStructType(objClass);
+        return new SparkFrame(getSession().createDataFrame(rows, structType));
+    }
+
+
+}
diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java b/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java
new file mode 100644
index 000000000..3032907f4
--- /dev/null
+++ b/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java
@@ -0,0 +1,48 @@
+package zingg.spark.client.util;
+
+import java.lang.reflect.Field;
+import java.security.NoSuchAlgorithmException;
+import java.util.List;
+
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import zingg.common.client.util.StructTypeFromPojoClass;
+
+public class SparkStructTypeFromPojoClass extends StructTypeFromPojoClass<StructType, StructField, DataType> {
+
+    public StructType getStructType(Class<?> objClass)
+            throws NoSuchAlgorithmException, IllegalArgumentException, IllegalAccessException {
+        List<StructField> structFields = getFields(objClass);
+        return new StructType(structFields.toArray(new StructField[structFields.size()]));
+    }
+
+    public StructField getStructField(Field field) {
+        field.setAccessible(true);
+        return new StructField(field.getName(), getSFType(field.getType()), true, Metadata.empty());
+    }
+
+    public DataType getSFType(Class<?> t) {
+        if (t.getCanonicalName().contains("String")) {
+            return DataTypes.StringType;
+        } else if (t.getCanonicalName().contains("Integer")) {
+            return DataTypes.IntegerType;
+        } else if (t.getCanonicalName().contains("Long")) {
+            return DataTypes.LongType;
+        } else if (t.getCanonicalName().contains("Float")) {
+            return DataTypes.FloatType;
+        } else if (t.getCanonicalName().contains("Double")) {
+            return DataTypes.DoubleType;
+        } else if (t.getCanonicalName().contains("Date")) {
+            return DataTypes.DateType;
+        } else if (t.getCanonicalName().contains("Timestamp")) {
+            return DataTypes.TimestampType;
+        }
+
+        return null;
+    }
+
+}
\ No newline at end of file
diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
index 8793abb3d..5d09a8fba 100644
--- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java
+++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
@@ -8,34 +8,67 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.Column;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.functions;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.types.*;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import scala.collection.JavaConverters;
+import zingg.common.client.Arguments;
+import zingg.common.client.IArguments;
 import zingg.common.client.ZFrame;
 import zingg.common.client.util.ColName;
+import zingg.common.client.util.DFObjectUtil;
 import zingg.spark.client.SparkFrame;
+import zingg.spark.client.util.SparkDFObjectUtil;
+
 import
static org.junit.jupiter.api.Assertions.assertEquals; -public class TestSparkFrame extends TestZFrameBase { +public class TestSparkFrame extends TestZFrameBase, Row, Column, DataType> { public static final Log LOG = LogFactory.getLog(TestSparkFrame.class); + public static IArguments args; + public static JavaSparkContext ctx; + public static SparkSession spark; + + @BeforeAll + public static void setup() { + setUpSpark(); + } + + protected static void setUpSpark() { + try { + spark = SparkSession + .builder() + .master("local[*]") + .appName("Zingg" + "Junit") + .getOrCreate(); + ctx = new JavaSparkContext(spark.sparkContext()); + JavaSparkContext.jarOfClass(TestZFrameBase.class); + args = new Arguments(); + } catch (Throwable e) { + if (LOG.isDebugEnabled()) + e.printStackTrace(); + LOG.info("Problem in spark env setup"); + } + } - public static final String NEW_COLUMN = "newColumn"; - - @Test - public void testCreateSparkDataFrameAndGetDF() { - SparkFrame sf = new SparkFrame(createSampleDataset()); - Dataset df = sf.df(); - assertTrue(df.except(createSampleDataset()).isEmpty(), "Two datasets are not equal"); + @AfterAll + public static void teardown() { + if (ctx != null) { + ctx.stop(); + ctx = null; + } + if (spark != null) { + spark.stop(); + spark = null; + } } - @Test - public void testColumnsNamesandCount() { - SparkFrame sf = new SparkFrame(createSampleDataset()); - assertTrue(Arrays.equals(sf.columns(), createSampleDataset().columns()), - "Columns of SparkFrame and the dataset are not equal"); + private SparkSession sparkSession; + + public TestSparkFrame() { + super(new SparkDFObjectUtil(spark)); } @Test @@ -46,272 +79,33 @@ public void testAliasOfSparkFrame() { assertTrueCheckingExceptOutput(sf.as(aliasName), sf, "Dataframe and its alias are not same"); } - @Test - public void testSelectWithSingleColumnName() { - Dataset df = createSampleDataset(); - ZFrame, Row, Column> sf = new SparkFrame(df); - String colName = "recid"; - ZFrame, Row, Column> sf2 = sf.select(colName); - SparkFrame sf3 = new SparkFrame(df.select(colName)); - assertTrueCheckingExceptOutput(sf2, sf3, "SparkFrame.select(colName) does not have expected value"); - } - - @Test - public void testSelectWithColumnList() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - List columnList = Arrays.asList(col("recid"), col("surname"), col("postcode")); - ZFrame, Row, Column> sf2 = sf.select(columnList); - SparkFrame sf3 = new SparkFrame( - df.select(JavaConverters.asScalaIteratorConverter(columnList.iterator()).asScala().toSeq())); - assertTrueCheckingExceptOutput(sf2, sf3, "SparkFrame.select(columnList) does not have expected value"); - } - - @Test - public void testSelectWithColumnArray() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - Column[] columnArray = new Column[] {col("recid"), col("surname"), col("postcode")}; - ZFrame, Row, Column> sf2 = sf.select(columnArray); - SparkFrame sf3 = new SparkFrame(df.select(columnArray)); - assertTrueCheckingExceptOutput(sf2, sf3, "SparkFrame.select(columnArray) value does not match with standard select output"); - } - - @Test - public void testSelectWithMultipleColumnNamesAsString() { - Dataset df = createSampleDataset(); - ZFrame, Row, Column> sf = new SparkFrame(df); - ZFrame, Row, Column> sf2 = sf.select("recid", "surname", "postcode"); - SparkFrame sf3 = new SparkFrame(df.select("recid", "surname", "postcode")); - assertTrueCheckingExceptOutput(sf2, sf3, "SparkFrame.select(str1, str2, ...) 
value does not match with standard select output"); - } - - @Test - public void testSelectExprByPassingColumnStringsAsInSQLStatement() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - ZFrame, Row, Column> sf2 = sf.selectExpr("recid as RecordId", "surname as FamilyName", "postcode as Pin"); - SparkFrame sf3 = new SparkFrame(df.selectExpr("recid", "surname", "postcode")); - assertTrueCheckingExceptOutput(sf2, sf3, "SparkFrame.selectExpr(str1, str2, ...) value does not match with standard selectExpr output"); - } - - @Test - public void testDistinct() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - SparkFrame sf2 = new SparkFrame(df.distinct()); - assertTrueCheckingExceptOutput(sf.distinct(), sf2, "SparkFrame.distict() does not match with standard distict() output"); - } - - @Test - public void testDropSingleColumn() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - ZFrame, Row, Column> sf2 = new SparkFrame(df.drop("recid")); - assertTrueCheckingExceptOutput(sf2, sf.drop("recid"), "SparkFrame.drop(str) does not match with standard drop() output"); - } - - @Test - public void testDropColumnsAsStringArray() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - ZFrame, Row, Column> sf2 = new SparkFrame(df.drop("recid", "surname", "postcode")); - assertTrueCheckingExceptOutput(sf2, sf.drop("recid", "surname", "postcode"), "SparkFrame.drop(str...) does not match with standard drop(str...) output"); - } - - @Test - public void testLimit() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - int len = 5; - ZFrame, Row, Column> sf2 = sf.limit(len); - assertTrue(sf2.count() == len); - assertTrueCheckingExceptOutput(sf2, sf.limit(len), "SparkFrame.limit(len) does not match with standard limit(len) output"); - } - - @Test - public void testDropDuplicatesConsideringGivenColumnsAsStringArray() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - String[] columnArray = new String[] {"surname", "postcode"}; - ZFrame, Row, Column> sf2 = new SparkFrame(df.dropDuplicates(columnArray)); - assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates(columnArray), "SparkFrame.dropDuplicates(str[]) does not match with standard dropDuplicates(str[]) output"); - } - - @Test - public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - ZFrame, Row, Column> sf2 = new SparkFrame(df.dropDuplicates("surname", "postcode")); - assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates("surname"), "SparkFrame.dropDuplicates(col1, col2) does not match with standard dropDuplicates(col1, col2) output"); - } + public Dataset createSampleDataset() { - @Test - public void testHead() { - Dataset df = createSampleDataset(); - SparkFrame sf = new SparkFrame(df); - Row row = sf.head(); - assertTrue(row.equals(df.head()), "Top Row is not the expected one"); - } - - @Test - public void testIsEmpty() { if (spark==null) { setUpSpark(); } - Dataset df = spark.emptyDataFrame(); - SparkFrame sf = new SparkFrame(df); - assertTrue(sf.isEmpty(), "DataFrame is not empty"); - } - - @Test - public void testGetAsInt() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - Row row = sf.head(); - LOG.debug("Value: " + row.getAs("recid")); - assertTrue(sf.getAsInt(row, "recid") == (int) row.getAs("recid"), "row.getAsInt(col) hasn't returned correct int 
value"); - } - @Test - public void testGetAsString() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - Row row = sf.head(); - LOG.debug("Value: " + row.getAs("surname")); - assertTrue(sf.getAsString(row, "surname").equals(row.getAs("surname")), "row.getAsString(col) hasn't returned correct string value"); - } - @Test - public void testGetAsDouble() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - Row row = sf.head(); - LOG.debug("Value: " + row.getAs("cost")); - assertTrue(sf.getAsDouble(row, "cost") == (double) row.getAs("cost"), "row.getAsDouble(col) hasn't returned correct double value"); - } - @Test - public void testSortDescending() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String col = STR_RECID; - ZFrame,Row,Column> sf2 = sf.sortDescending(col); - assertTrueCheckingExceptOutput(sf2, df.sort(functions.desc(col)), "SparkFrame.sortDescending() output is not as expected"); - } - - @Test - public void testSortAscending() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String col = STR_RECID; - ZFrame,Row,Column> sf2 = sf.sortAscending(col); - assertTrueCheckingExceptOutput(sf2, df.sort(functions.asc(col)), "SparkFrame.sortAscending() output is not as expected"); - } - @Test - public void testWithColumnforIntegerValue() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String newCol = NEW_COLUMN; - int newColVal = 36; - ZFrame,Row,Column> sf2 = sf.withColumn(newCol, newColVal); - assertTrueCheckingExceptOutput(sf2, df.withColumn(newCol, functions.lit(newColVal)), "SparkFrame.withColumn(c, int) output is not as expected"); - } - - @Test - public void testWithColumnforDoubleValue() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String newCol = NEW_COLUMN; - double newColVal = 3.14; - ZFrame,Row,Column> sf2 = sf.withColumn(newCol, newColVal); - assertTrueCheckingExceptOutput(sf2, df.withColumn(newCol, functions.lit(newColVal)), "SparkFrame.withColumn(c, double) output is not as expected"); - } - - @Test - public void testWithColumnforStringValue() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String newCol = NEW_COLUMN; - String newColVal = "zingg"; - ZFrame,Row,Column> sf2 = sf.withColumn(newCol, newColVal); - assertTrueCheckingExceptOutput(sf2, df.withColumn(newCol, functions.lit(newColVal)), "SparkFrame.withColumn(c, String) output is not as expected"); + StructType schemaOfSample = new StructType(new StructField[] { + new StructField("recid", DataTypes.StringType, false, Metadata.empty()), + new StructField("givenname", DataTypes.StringType, false, Metadata.empty()), + new StructField("surname", DataTypes.StringType, false, Metadata.empty()), + new StructField("suburb", DataTypes.StringType, false, Metadata.empty()), + new StructField("postcode", DataTypes.StringType, false, Metadata.empty()) + }); + + Dataset sample = spark.createDataFrame(Arrays.asList( + RowFactory.create("07317257", "erjc", "henson", "hendersonville", "2873g"), + RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"), + RowFactory.create("02890805", "david", "pisczek", "durham", "27717"), + RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"), + RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"), + 
RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"), + RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"), + RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"), + RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"), + RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample); + + return sample; } - @Test - public void testWithColumnforAnotherColumn() { - Dataset df = createSampleDatasetHavingMixedDataTypes(); - SparkFrame sf = new SparkFrame(df); - String oldCol = STR_RECID; - String newCol = NEW_COLUMN; - ZFrame,Row,Column> sf2 = sf.withColumn(newCol, col(oldCol)); - assertTrueCheckingExceptOutput(sf2, df.withColumn(newCol, col(oldCol)), "SparkFrame.withColumn(c, Column) output is not as expected"); - } - - @Test - public void testGetMaxVal(){ - SparkFrame zScoreDF = getZScoreDF(); - assertEquals(400,zScoreDF.getMaxVal(ColName.CLUSTER_COLUMN)); - } - - @Test - public void testGroupByMinMax(){ - SparkFrame zScoreDF = getZScoreDF(); - ZFrame, Row, Column> groupByDF = zScoreDF.groupByMinMaxScore(zScoreDF.col(ColName.ID_COL)); - - Dataset assertionDF = groupByDF.df(); - List assertionRows = assertionDF.collectAsList(); - for (Row row : assertionRows) { - if(row.getInt(0)==1) { - assertEquals(1001,row.getInt(1)); - assertEquals(2002,row.getInt(2)); - } - } - } - - @Test - public void testGroupByMinMax2(){ - SparkFrame zScoreDF = getZScoreDF(); - ZFrame, Row, Column> groupByDF = zScoreDF.groupByMinMaxScore(zScoreDF.col(ColName.CLUSTER_COLUMN)); - - Dataset assertionDF = groupByDF.df(); - List assertionRows = assertionDF.collectAsList(); - for (Row row : assertionRows) { - if(row.getInt(0)==100) { - assertEquals(900,row.getInt(1)); - assertEquals(9002,row.getInt(2)); - } - } - } - - @Test - public void testRightJoinMultiCol(){ - ZFrame, Row, Column> inpData = getInputData(); - ZFrame, Row, Column> clusterData = getClusterData(); - ZFrame, Row, Column> joinedData = clusterData.join(inpData,ColName.ID_COL,ColName.SOURCE_COL,ZFrame.RIGHT_JOIN); - assertEquals(10,joinedData.count()); - } - - @Test - public void testFilterInCond(){ - SparkFrame inpData = getInputData(); - SparkFrame clusterData = getClusterDataWithNull(); - ZFrame, Row, Column> filteredData = inpData.filterInCond(ColName.ID_COL, clusterData, ColName.COL_PREFIX+ ColName.ID_COL); - assertEquals(5,filteredData.count()); - } - - @Test - public void testFilterNotNullCond(){ - SparkFrame clusterData = getClusterDataWithNull(); - ZFrame, Row, Column> filteredData = clusterData.filterNotNullCond(ColName.SOURCE_COL); - assertEquals(3,filteredData.count()); - } - - @Test - public void testFilterNullCond(){ - SparkFrame clusterData = getClusterDataWithNull(); - ZFrame, Row, Column> filteredData = clusterData.filterNullCond(ColName.SOURCE_COL); - assertEquals(2,filteredData.count()); - } - - - } \ No newline at end of file diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java index 4944f45a8..567c0ce90 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java @@ -1,17 +1,15 @@ package zingg.client; -import static org.junit.jupiter.api.Assertions.assertTrue; - +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.Optional; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import 
org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -19,20 +17,25 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; +import zingg.common.client.util.DFObjectUtil; +import zingg.common.client.util.PojoToArrayConverter; import zingg.spark.client.SparkFrame; -public class TestZFrameBase { +import static org.apache.spark.sql.functions.col; +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; - public static IArguments args; - public static JavaSparkContext ctx; - public static SparkSession spark; +public abstract class TestZFrameBase { - public static final Log LOG = LogFactory.getLog(TestZFrameBase.class); + private final DFObjectUtil dfObjectUtil; + public static final Log LOG = LogFactory.getLog(TestZFrameBase.class); + public static final String NEW_COLUMN = "newColumn"; public static final String STR_RECID = "recid"; public static final String STR_GIVENNAME = "givenname"; public static final String STR_SURNAME = "surname"; @@ -40,176 +43,598 @@ public class TestZFrameBase { public static final String STR_POSTCODE = "postcode"; public static final String STR_SUBURB = "suburb"; - @BeforeAll - public static void setup() { - setUpSpark(); - } - - protected static void setUpSpark() { - try { - spark = SparkSession - .builder() - .master("local[*]") - .appName("Zingg" + "Junit") - .getOrCreate(); - ctx = new JavaSparkContext(spark.sparkContext()); - JavaSparkContext.jarOfClass(TestZFrameBase.class); - args = new Arguments(); - } catch (Throwable e) { - if (LOG.isDebugEnabled()) - e.printStackTrace(); - LOG.info("Problem in spark env setup"); + public TestZFrameBase(DFObjectUtil dfObjectUtil) { + this.dfObjectUtil = dfObjectUtil; + } + + + @Test + public void testCreateSparkDataFrameAndGetDF() throws Exception { + List sampleDataSet = createSampleDataList(); + + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + //assert rows + List pojoList = (List) zFrame.collectAsList(); + for(int idx = 0; idx < sampleDataSet.size(); idx++) { + assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); + } + } + + @Test + public void testColumnsNamesandCount() throws Exception { + List sampleDataSet = createSampleDataList(); + + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + //assert on fields + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(zFrame.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); + } + + @Test + public void testSelectWithSingleColumnName() throws Exception { + List sampleDataSet = 
createSampleDataList(); //List + + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String colName = "recid"; + List pojoList = (List) zFrame.select(colName).collectAsList(); + + for (int idx = 0; idx < sampleDataSet.size(); idx++){ + assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); + } + } + + @Test + public void testSelectWithColumnList() throws Exception { + List sampleDataSet = createSampleDataList(); //List + + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List columnList = Arrays.asList(col("recid"), col("surname"), col("postcode")); + + List pojoList = (List) zFrame.select((List) columnList).collectAsList(); + + for(int idx = 0; idx < sampleDataSet.size(); idx++) { + assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); } } - @AfterAll - public static void teardown() { - if (ctx != null) { - ctx.stop(); - ctx = null; + @Test + public void testSelectWithColumnArray() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + Column[] columnArray = new Column[] { col("recid"), col("surname"), col("postcode") }; + + List pojoList = (List) zFrame.select((C)columnArray).collectAsList(); + + for(int idx = 0; idx < sampleDataSet.size(); idx++) { + assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); } - if (spark != null) { - spark.stop(); - spark = null; + } + + @Test + public void testSelectWithMultipleColumnNamesAsString() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List pojoList = (List) zFrame.select("recid", "surname", "postcode").collectAsList(); + + for(int idx = 0; idx < sampleDataSet.size(); idx++) { + assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); } } - public Dataset createSampleDataset() { - - if (spark==null) { - setUpSpark(); + @Test + public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List pojoList = (List) zFrame.selectExpr("recid as RecordId", "surname as FamilyName", + "postcode as Pin").collectAsList(); + + for(int idx = 0; idx < sampleDataSet.size(); idx++) { + assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData 
doesn't match"); + assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); + assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); } - - StructType schemaOfSample = new StructType(new StructField[] { - new StructField("recid", DataTypes.StringType, false, Metadata.empty()), - new StructField("givenname", DataTypes.StringType, false, Metadata.empty()), - new StructField("surname", DataTypes.StringType, false, Metadata.empty()), - new StructField("suburb", DataTypes.StringType, false, Metadata.empty()), - new StructField("postcode", DataTypes.StringType, false, Metadata.empty()) - }); - - Dataset sample = spark.createDataFrame(Arrays.asList( - RowFactory.create("07317257", "erjc", "henson", "hendersonville", "2873g"), - RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"), - RowFactory.create("02890805", "david", "pisczek", "durham", "27717"), - RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"), - RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"), - RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"), - RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"), - RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"), - RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"), - RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample); + } - return sample; + @Test + public void testDropSingleColumn() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List fieldsInZFrame = new ArrayList<>(); + List fieldsInTestData = new ArrayList<>(); + Arrays.stream(zFrame.drop("recid").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.remove("recid"); + + assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); + } + + @Test + public void testDropColumnsAsStringArray() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List fieldsInZFrame = new ArrayList<>(); + List fieldsInTestData = new ArrayList<>(); + Arrays.stream(zFrame.drop("recid", "surname", "postcode").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.remove("recid"); + fieldsInTestData.remove("surname"); + fieldsInTestData.remove("postcode"); + + assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); } - public Dataset createSampleDatasetHavingMixedDataTypes() { - if (spark==null) { - setUpSpark(); + @Test + public void testLimit() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + int len = 5; + List pojoList = (List) zFrame.limit(len).collectAsList(); + + assertEquals(pojoList.size(), len, "Size is not equal"); + + //assert on rows + for(int idx = 0; idx < len; idx++) { + 
assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); } - - StructType schemaOfSample = new StructType(new StructField[] { - new StructField(STR_RECID, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(STR_GIVENNAME, DataTypes.StringType, false, Metadata.empty()), - new StructField(STR_SURNAME, DataTypes.StringType, false, Metadata.empty()), - new StructField(STR_COST, DataTypes.DoubleType, false, Metadata.empty()), - new StructField(STR_POSTCODE, DataTypes.IntegerType, false, Metadata.empty()) - }); - - Dataset sample = spark.createDataFrame(Arrays.asList( - RowFactory.create(7317, "erjc", "henson", 0.54, 2873), - RowFactory.create(3102, "jhon", "kozak", 99.009, 28792), - RowFactory.create(2890, "david", "pisczek", 58.456, 27717), - RowFactory.create(4437, "e5in", "bbrown", 128.45, 27858) - ), schemaOfSample); + } + + @Test + public void testHead() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + GenericRowWithSchema row = (GenericRowWithSchema) zFrame.head(); + + assertArrayEquals(row.values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(0)), + "Top row from zFrame and sample data doesn't match"); + } + + @Test + public void testGetAsInt() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertTrue(zFrame.getAsInt(row, "recid") == sampleDataSet.get(0).recid, + "row.getAsInt(col) hasn't returned correct int value"); + } + + @Test + public void testGetAsString() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(0).surname, "row.getAsString(col) hasn't returned correct string value"); + } + + @Test + public void testGetAsDouble() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertEquals(zFrame.getAsDouble(row, "cost"), sampleDataSet.get(0).cost, "row.getAsDouble(col) hasn't returned correct double value"); + } + + @Test + public void testWithColumnForIntegerValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + String newCol = NEW_COLUMN; + int newColVal = 36; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + assertEquals(row.getAs(newCol), Integer.valueOf(newColVal), "value of added column is not as expected"); + } + + @Test + 
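    // editor's note: the cast to Spark's GenericRowWithSchema in the withColumn tests
    // reintroduces a Spark type into this otherwise engine-agnostic base class. A minimal
    // engine-neutral sketch of the same first-row check, using only ZFrame accessors that
    // already appear in this file:
    //
    //     R row = zFrameWithAddedColumn.head();
    //     assertEquals(newColVal, zFrameWithAddedColumn.getAsInt(row, newCol));
    //
    // getAsString and getAsDouble cover the string and double variants the same way.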
public void testWithColumnForDoubleValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String newCol = NEW_COLUMN; + double newColVal = 3.14; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + assertEquals(row.getAs(newCol), Double.valueOf(newColVal), "value of added column is not as expected"); + } + + @Test + public void testWithColumnForStringValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String newCol = NEW_COLUMN; + String newColVal = "zingg"; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + assertEquals(row.getAs(newCol), newColVal, "value of added column is not as expected"); + } + + @Test + public void testWithColumnforAnotherColumn() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String oldCol = STR_RECID; + String newCol = NEW_COLUMN; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, (C) col(oldCol)); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + assertEquals(Optional.of(row.getAs(newCol)), Optional.of(row.getAs(oldCol)), "value of added column is not as expected"); + } + + @Test + public void testGetMaxVal() throws Exception { + List sampleDataSet = createSampleDataZScore(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + + assertEquals(400, zFrame.getMaxVal(ColName.CLUSTER_COLUMN), "Max value is not as expected"); + } + + @Test + public void testGroupByMinMax() throws Exception { + List sampleDataSet = createSampleDataZScore(); //List + 
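        // editor's note: this test and testGroupByMinMax2 currently stop at invoking
        // groupByMinMaxScore; the old Spark-specific checks survive only as the commented
        // blocks below. Going by those blocks, the result groups by the supplied column and
        // carries the min and max of z_score per group. A hedged, engine-neutral restoration
        // sketch (the grouped output's column names are assumed, and note the refactored
        // fixture changed the data: z_zid == 1 now spans scores 101..2002, not 1001..2002):
        //
        //     for (R r : groupByDF.collectAsList()) {
        //         if (zFrame.getAsInt(r, ColName.ID_COL) == 1) {
        //             // assert min == 101 and max == 2002 on the grouped columns here
        //         }
        //     }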
ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + + ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.ID_COL)); + +// Dataset assertionDF = groupByDF.df(); +// List assertionRows = assertionDF.collectAsList(); +// for (Row row : assertionRows) { +// if(row.getInt(0)==1) { +// assertEquals(1001,row.getInt(1)); +// assertEquals(2002,row.getInt(2)); +// } +// } + } + + @Test + public void testGroupByMinMax2() throws Exception { + List sampleDataSet = createSampleDataZScore(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + + ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.CLUSTER_COLUMN)); + +// Dataset assertionDF = groupByDF.df(); +// List assertionRows = assertionDF.collectAsList(); +// for (Row row : assertionRows) { +// if(row.getInt(0)==100) { +// assertEquals(900,row.getInt(1)); +// assertEquals(9002,row.getInt(2)); +// } +// } + } + + @Test + public void testRightJoinMultiCol() throws Exception { + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class); + List sampleDataSetCluster = createSampleDataCluster(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class); + + ZFrame joinedData = zFrameCluster.join(zFrameInput,ColName.ID_COL,ColName.SOURCE_COL,ZFrame.RIGHT_JOIN); + assertEquals(10,joinedData.count()); + } + + @Test + public void testFilterInCond() throws Exception { + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class); + ZFrame filteredData = zFrameInput.filterInCond(ColName.ID_COL, zFrameCluster, ColName.COL_PREFIX+ ColName.ID_COL); + assertEquals(5,filteredData.count()); + } + + @Test + public void testFilterNotNullCond() throws Exception { + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class); + + ZFrame filteredData = zFrameCluster.filterNotNullCond(ColName.SOURCE_COL); + assertEquals(3,filteredData.count()); + } + + @Test + public void testFilterNullCond() throws Exception { + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class); + + ZFrame filteredData = zFrameCluster.filterNullCond(ColName.SOURCE_COL); + assertEquals(2, filteredData.count()); + } + + @Test + public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exception{ + List sampleDataSetCluster = createSampleDataList(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class); + + String[] columnArray = new String[] {"surname", "postcode"}; + ZFrame zFrameDeDuplicated = zFrame.dropDuplicates(columnArray); + +// ZFrame, Row, Column> sf2 = new SparkFrame(df.dropDuplicates(columnArray)); +// assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates(columnArray), "SparkFrame.dropDuplicates(str[]) does not match with standard dropDuplicates(str[]) output"); + } + + @Test + public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws Exception { + List sampleDataSetCluster = 
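        // editor's note: the two dropDuplicates tests here, like the sort and distinct tests
        // further down, currently run the operation but assert nothing (their Spark-based
        // checks are kept commented out). Until engine-neutral equivalents exist, even a
        // coarse invariant would catch regressions; a sketch using only ZFrame methods
        // already exercised in this class:
        //
        //     assertTrue(zFrameDeDuplicated.count() <= zFrame.count());
        //     assertEquals(zFrame.fields().length, zFrameDeDuplicated.fields().length);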
createSampleDataList(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class); + ZFrame zFrameDeDuplicated = zFrame.dropDuplicates("surname", "postcode"); + +// ZFrame, Row, Column> sf2 = new SparkFrame(df.dropDuplicates("surname", "postcode")); +// assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates("surname"), "SparkFrame.dropDuplicates(col1, col2) does not match with standard dropDuplicates(col1, col2) output"); + } + + @Test + public void testSortDescending() throws Exception { + List sampleData = createSampleDataListWithMixedDataType(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class); + + String col = STR_RECID; + ZFrame zFrameSortedDesc = zFrame.sortDescending(col); + +// SparkFrame sf = new SparkFrame(df); +// String col = STR_RECID; +// ZFrame,Row,Column> sf2 = sf.sortDescending(col); +// assertTrueCheckingExceptOutput(sf2, df.sort(functions.desc(col)), "SparkFrame.sortDescending() output is not as expected"); + } + + @Test + public void testSortAscending() throws Exception { + List sampleData = createSampleDataListWithMixedDataType(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class); + + String col = STR_RECID; + ZFrame zFrameSortedDesc = zFrame.sortAscending(col); + +// SparkFrame sf = new SparkFrame(df); +// String col = STR_RECID; +// ZFrame,Row,Column> sf2 = sf.sortAscending(col); +// assertTrueCheckingExceptOutput(sf2, df.sort(functions.asc(col)), "SparkFrame.sortAscending() output is not as expected"); + } + + @Test + public void testIsEmpty() { +// ZFrame +// Dataset df = sparkSession.emptyDataFrame(); +// SparkFrame sf = new SparkFrame(df); +// assertTrue(sf.isEmpty(), "DataFrame is not empty"); + } + + @Test + public void testDistinct() throws Exception { + List sampleDataSet = createSampleDataList(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + zFrame.distinct(); +// SparkFrame sf = new SparkFrame(df); +// SparkFrame sf2 = new SparkFrame(df.distinct()); +// assertTrueCheckingExceptOutput(sf.distinct(), sf2, "SparkFrame.distict() does not match with standard distict() output"); + } + + //sample data to be used for testing + public static List createSampleDataList() { + List sample = new ArrayList(); + sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); + sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); + sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); + sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); + sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); + + sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); + sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); + sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); + sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); + sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); return sample; } - protected SparkFrame getZScoreDF() { - Row[] rows = { - RowFactory.create( 0,100,900), - RowFactory.create( 1,100,1001), - RowFactory.create( 1,100,1002), - RowFactory.create( 1,100,2001), - RowFactory.create( 1,100,2002), - RowFactory.create( 11,100,9002), - RowFactory.create( 3,300,3001), - RowFactory.create( 3,300,3002), - RowFactory.create( 3,400,4001), - RowFactory.create( 4,400,4002) - }; - 
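    // editor's note: everything removed from here down is the old hand-built Spark fixture
    // set (Row[] arrays plus explicit StructType definitions). The refactor replaces it with
    // the POJO lists above, whose schema StructTypeFromPojoClass derives reflectively, so
    // the same test data can back any engine implementation.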
StructType schema = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.CLUSTER_COLUMN, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.SCORE_COL, DataTypes.IntegerType, false, Metadata.empty())}); - SparkFrame df = new SparkFrame(spark.createDataFrame(Arrays.asList(rows), schema)); - return df; - } - - protected SparkFrame getInputData() { - Row[] rows = { - RowFactory.create( 1,"fname1","b"), - RowFactory.create( 2,"fname","a"), - RowFactory.create( 3,"fna","b"), - RowFactory.create( 4,"x","c"), - RowFactory.create( 5,"y","c"), - RowFactory.create( 11,"new1","b"), - RowFactory.create( 22,"new12","a"), - RowFactory.create( 33,"new13","b"), - RowFactory.create( 44,"new14","c"), - RowFactory.create( 55,"new15","c") - }; - StructType schema = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField("fname", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())}); - SparkFrame df = new SparkFrame(spark.createDataFrame(Arrays.asList(rows), schema)); - return df; - } - - - protected SparkFrame getClusterData() { - Row[] rows = { - RowFactory.create( 1,100,1001,"b"), - RowFactory.create( 2,100,1002,"a"), - RowFactory.create( 3,100,2001,"b"), - RowFactory.create( 4,900,2002,"c"), - RowFactory.create( 5,111,9002,"c") - }; - StructType schema = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.CLUSTER_COLUMN, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.SCORE_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())}); - SparkFrame df = new SparkFrame(spark.createDataFrame(Arrays.asList(rows), schema)); - return df; - } - - protected SparkFrame getClusterDataWithNull() { - Row[] rows = { - RowFactory.create( 1,100,1001,"b"), - RowFactory.create( 2,100,1002,"a"), - RowFactory.create( 3,100,2001,null), - RowFactory.create( 4,900,2002,"c"), - RowFactory.create( 5,111,9002,null) - }; - StructType schema = new StructType(new StructField[] { - new StructField(ColName.COL_PREFIX+ ColName.ID_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.CLUSTER_COLUMN, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.SCORE_COL, DataTypes.IntegerType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, true, Metadata.empty())}); - SparkFrame df = new SparkFrame(spark.createDataFrame(Arrays.asList(rows), schema)); - return df; - } - - protected void assertTrueCheckingExceptOutput(ZFrame, Row, Column> sf1, ZFrame, Row, Column> sf2, String message) { - assertTrue(sf1.except(sf2).isEmpty(), message); + public static List createSampleDataListWithMixedDataType() { + List sample = new ArrayList(); + sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873)); + sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792)); + sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717)); + sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858)); + sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747)); + + sample.add(new 
SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780)); + sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909)); + sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405)); + sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406)); + sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734)); + + return sample; } - - - protected void assertTrueCheckingExceptOutput(ZFrame, Row, Column> sf1, Dataset df2, String message) { - SparkFrame sf2 = new SparkFrame(df2); + + public static List createSampleDataZScore() { + + List sample = new ArrayList<>(); + sample.add(new SchemaZScore(0, 100, 900)); + sample.add(new SchemaZScore(1, 100, 101)); + sample.add(new SchemaZScore(1, 100, 2001)); + sample.add(new SchemaZScore(1, 100, 2002)); + sample.add(new SchemaZScore(11, 100, 2002)); + sample.add(new SchemaZScore(3, 300, 3002)); + sample.add(new SchemaZScore(3, 400, 4002)); + + return sample; + } + + public static List createSampleDataCluster() { + + List sample = new ArrayList<>(); + sample.add(new SchemaCluster(1,100,1001,"b")); + sample.add(new SchemaCluster(2,100,1002,"a")); + sample.add(new SchemaCluster(3,100,2001,"b")); + sample.add(new SchemaCluster(4, 900, 2002, "c")); + sample.add(new SchemaCluster(5, 111, 9002, "c")); + + return sample; + } + + public static List createSampleDataClusterWithNull() { + + List sample = new ArrayList<>(); + sample.add(new SchemaCluster( 1,100,1001,"b")); + sample.add(new SchemaCluster( 2,100,1002,"a")); + sample.add(new SchemaCluster( 3,100,2001,null)); + sample.add(new SchemaCluster(4,900,2002,"c")); + sample.add(new SchemaCluster( 5,111,9002,null)); + + return sample; + } + + public static List createSampleDataInput() { + + List sample = new ArrayList<>(); + sample.add(new SchemaInput(1,"fname1","b" )); + sample.add(new SchemaInput( 2,"fname","a")); + sample.add(new SchemaInput(3,"fna","b")); + sample.add((new SchemaInput(4,"x","c"))); + sample.add(new SchemaInput(5,"y","c")); + sample.add(new SchemaInput(11,"new1","b")); + sample.add(new SchemaInput(22,"new12","a")); + sample.add(new SchemaInput(33,"new13","b")); + sample.add(new SchemaInput( 44,"new14","c")); + sample.add(new SchemaInput(55,"new15","c")); + + return sample; + } + + + protected void assertTrueCheckingExceptOutput(ZFrame sf1, ZFrame sf2, String message) { assertTrue(sf1.except(sf2).isEmpty(), message); } + + //POJO classes for defining schema + + public static class Schema { + public final String recid; + public final String givenname; + public final String surname; + public final String suburb; + public final String postcode; + + public Schema(String recid, String givename, String surname, String suburb, String postcode) { + this.recid = recid; + this.givenname = givename; + this.surname = surname; + this.suburb = suburb; + this.postcode = postcode; + } + } + + public static class SchemaWithMixedDataType { + public final Integer recid; + public final String givenname; + public final String surname; + public final Double cost; + public final Integer postcode; + + public SchemaWithMixedDataType(Integer recid, String givename, String surname, Double cost, Integer postcode) { + this.recid = recid; + this.givenname = givename; + this.surname = surname; + this.cost = cost; + this.postcode = postcode; + } + } + + public static class SchemaZScore { + public final Integer z_zid; + public final Integer z_cluster; + public final Integer z_score; + + public SchemaZScore(Integer 
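    // editor's note (applies to SchemaZScore here and the cluster/input POJOs that follow):
    // StructTypeFromPojoClass turns each field name directly into a column name, so fields
    // such as z_zid, z_cluster, z_score and z_zsource appear to be chosen to match the
    // ColName constants the tests query (ColName.ID_COL, ColName.CLUSTER_COLUMN,
    // ColName.SCORE_COL, ColName.SOURCE_COL). Renaming a field here would silently break
    // those lookups.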
z_zid, Integer z_cluster, Integer z_score) { + this.z_zid = z_zid; + this.z_cluster = z_cluster; + this.z_score = z_score; + } + } + + public static class SchemaCluster { + public final Integer z_zid; + public final Integer z_cluster; + public final Integer z_score; + public final String z_zsource; + + public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, String z_zsource) { + this.z_zid = z_zid; + this.z_cluster = z_cluster; + this. z_score = z_score; + this.z_zsource = z_zsource; + } + } + + public static class SchemaInput { + public final Integer z_zid; + public final String fname; + public final String z_zsource; + + public SchemaInput(Integer z_zid, String fname, String z_zsource) { + this.z_zid = z_zid; + this.fname = fname; + this.z_zsource = z_zsource; + } + } } \ No newline at end of file From bf99a8b2d98d7a9312dbdcbf574705c713dc4422 Mon Sep 17 00:00:00 2001 From: administrator Date: Fri, 5 Jul 2024 15:24:46 +0530 Subject: [PATCH 152/219] removed unused imports --- .../test/java/zingg/client/TestSparkFrame.java | 5 ----- .../test/java/zingg/client/TestZFrameBase.java | 15 --------------- 2 files changed, 20 deletions(-) diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java index 5d09a8fba..0be7144a2 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java @@ -4,7 +4,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.Arrays; -import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -15,12 +14,8 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import scala.collection.JavaConverters; import zingg.common.client.Arguments; import zingg.common.client.IArguments; -import zingg.common.client.ZFrame; -import zingg.common.client.util.ColName; -import zingg.common.client.util.DFObjectUtil; import zingg.spark.client.SparkFrame; import zingg.spark.client.util.SparkDFObjectUtil; diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java index 567c0ce90..48a07ec92 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java @@ -7,24 +7,14 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import zingg.common.client.Arguments; -import zingg.common.client.IArguments; import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.client.util.PojoToArrayConverter; -import zingg.spark.client.SparkFrame; import static org.apache.spark.sql.functions.col; import static org.junit.jupiter.api.Assertions.*; @@ -37,11 +27,6 @@ public abstract class TestZFrameBase { public static final Log LOG = LogFactory.getLog(TestZFrameBase.class); public static final String NEW_COLUMN = "newColumn"; public static 
final String STR_RECID = "recid"; - public static final String STR_GIVENNAME = "givenname"; - public static final String STR_SURNAME = "surname"; - public static final String STR_COST = "cost"; - public static final String STR_POSTCODE = "postcode"; - public static final String STR_SUBURB = "suburb"; public TestZFrameBase(DFObjectUtil dfObjectUtil) { this.dfObjectUtil = dfObjectUtil; From 9db584927e8f18ff55371aabb705b9d781ac656c Mon Sep 17 00:00:00 2001 From: administrator Date: Sat, 6 Jul 2024 04:06:14 +0530 Subject: [PATCH 153/219] refactored junits --- pom.xml | 2 +- spark/client/pom.xml | 6 +- .../java/zingg/client/TestSparkFrame.java | 3 - .../java/zingg/client/TestZFrameBase.java | 1349 +++++++++-------- 4 files changed, 746 insertions(+), 614 deletions(-) diff --git a/pom.xml b/pom.xml index c63f6ffcc..76decc5e6 100644 --- a/pom.xml +++ b/pom.xml @@ -146,7 +146,7 @@ com.fasterxml.jackson.module jackson-module-scala_2.12 - 2.12.2 + 2.17.1 diff --git a/spark/client/pom.xml b/spark/client/pom.xml index 10ee01930..28be7db77 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -8,8 +8,8 @@ zingg-spark-client jar - 2.15.2 - 2.15.2 + 2.17.0 + 2.17.0 @@ -46,7 +46,7 @@ net.alchim31.maven scala-maven-plugin - 4.8.0 + 4.9.2 scala-compile-first diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java index 0be7144a2..9446d5e93 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java @@ -1,7 +1,5 @@ package zingg.client; -import static org.apache.spark.sql.functions.col; -import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.Arrays; @@ -19,7 +17,6 @@ import zingg.spark.client.SparkFrame; import zingg.spark.client.util.SparkDFObjectUtil; -import static org.junit.jupiter.api.Assertions.assertEquals; public class TestSparkFrame extends TestZFrameBase, Row, Column, DataType> { public static final Log LOG = LogFactory.getLog(TestSparkFrame.class); public static IArguments args; diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java index 48a07ec92..efcf4d02a 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java @@ -1,625 +1,760 @@ package zingg.client; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.spark.sql.*; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; - +import org.apache.spark.sql.functions; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.client.util.PojoToArrayConverter; -import static org.apache.spark.sql.functions.col; -import static org.junit.jupiter.api.Assertions.*; -import static org.junit.jupiter.api.Assertions.assertEquals; - -public abstract class TestZFrameBase { - - private final DFObjectUtil dfObjectUtil; - - public static final Log LOG = LogFactory.getLog(TestZFrameBase.class); - public static final String NEW_COLUMN = "newColumn"; - public static final String STR_RECID = "recid"; - - public TestZFrameBase(DFObjectUtil dfObjectUtil) { 
- this.dfObjectUtil = dfObjectUtil; - } - - - @Test - public void testCreateSparkDataFrameAndGetDF() throws Exception { - List sampleDataSet = createSampleDataList(); - - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - //assert rows - List pojoList = (List) zFrame.collectAsList(); - for(int idx = 0; idx < sampleDataSet.size(); idx++) { - assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); - } - } - - @Test - public void testColumnsNamesandCount() throws Exception { - List sampleDataSet = createSampleDataList(); - - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - //assert on fields - List fieldsInTestData = new ArrayList<>(); - List fieldsInZFrame = new ArrayList<>(); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); - Arrays.stream(zFrame.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); - } - - @Test - public void testSelectWithSingleColumnName() throws Exception { - List sampleDataSet = createSampleDataList(); //List - - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - String colName = "recid"; - List pojoList = (List) zFrame.select(colName).collectAsList(); - - for (int idx = 0; idx < sampleDataSet.size(); idx++){ - assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - } - } - - @Test - public void testSelectWithColumnList() throws Exception { - List sampleDataSet = createSampleDataList(); //List - - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - List columnList = Arrays.asList(col("recid"), col("surname"), col("postcode")); - - List pojoList = (List) zFrame.select((List) columnList).collectAsList(); - - for(int idx = 0; idx < sampleDataSet.size(); idx++) { - assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); - } - } - - @Test - public void testSelectWithColumnArray() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - Column[] columnArray = new Column[] { col("recid"), col("surname"), col("postcode") }; - - List pojoList = (List) zFrame.select((C)columnArray).collectAsList(); - - for(int idx = 0; idx < sampleDataSet.size(); idx++) { - assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); - } - } - - @Test - public void testSelectWithMultipleColumnNamesAsString() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - List pojoList = (List) zFrame.select("recid", "surname", 
"postcode").collectAsList(); - - for(int idx = 0; idx < sampleDataSet.size(); idx++) { - assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); - } - } - - @Test - public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - List pojoList = (List) zFrame.selectExpr("recid as RecordId", "surname as FamilyName", - "postcode as Pin").collectAsList(); +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; - for(int idx = 0; idx < sampleDataSet.size(); idx++) { - assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); - assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); - } - } - - @Test - public void testDropSingleColumn() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - List fieldsInZFrame = new ArrayList<>(); - List fieldsInTestData = new ArrayList<>(); - Arrays.stream(zFrame.drop("recid").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); - fieldsInTestData.remove("recid"); - - assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); - } - - @Test - public void testDropColumnsAsStringArray() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - List fieldsInZFrame = new ArrayList<>(); - List fieldsInTestData = new ArrayList<>(); - Arrays.stream(zFrame.drop("recid", "surname", "postcode").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); - fieldsInTestData.remove("recid"); - fieldsInTestData.remove("surname"); - fieldsInTestData.remove("postcode"); - - assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); - } - - @Test - public void testLimit() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - int len = 5; - List pojoList = (List) zFrame.limit(len).collectAsList(); - - assertEquals(pojoList.size(), len, "Size is not equal"); - - //assert on rows - for(int idx = 0; idx < len; idx++) { - assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); - } - } - - @Test - public void testHead() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - - 
-
-        assertArrayEquals(row.values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(0)),
-                "Top row from zFrame and sample data doesn't match");
-    }
-
-    @Test
-    public void testGetAsInt() throws Exception {
-        List sampleDataSet = createSampleDataListWithMixedDataType(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class);
-
-        R row = zFrame.head();
-        assertTrue(zFrame.getAsInt(row, "recid") == sampleDataSet.get(0).recid,
-                "row.getAsInt(col) hasn't returned correct int value");
-    }
-
-    @Test
-    public void testGetAsString() throws Exception {
-        List sampleDataSet = createSampleDataListWithMixedDataType(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class);
-
-        R row = zFrame.head();
-        assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(0).surname, "row.getAsString(col) hasn't returned correct string value");
-    }
-
-    @Test
-    public void testGetAsDouble() throws Exception {
-        List sampleDataSet = createSampleDataListWithMixedDataType(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class);
-
-        R row = zFrame.head();
-        assertEquals(zFrame.getAsDouble(row, "cost"), sampleDataSet.get(0).cost, "row.getAsDouble(col) hasn't returned correct double value");
-    }
-
-    @Test
-    public void testWithColumnForIntegerValue() throws Exception {
-        List sampleDataSet = createSampleDataList(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-
-        String newCol = NEW_COLUMN;
-        int newColVal = 36;
-        ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal);
-
-        List fieldsInTestData = new ArrayList<>();
-        List fieldsInZFrame = new ArrayList<>();
-        Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
-        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
-        fieldsInTestData.add(newCol);
-
-        //Assert on columns
-        assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal");
-
-        //Assert on first row
-        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
-        assertEquals(row.getAs(newCol), Integer.valueOf(newColVal), "value of added column is not as expected");
-    }
-
-    @Test
-    public void testWithColumnForDoubleValue() throws Exception {
-        List sampleDataSet = createSampleDataList(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-        String newCol = NEW_COLUMN;
-        double newColVal = 3.14;
-        ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal);
-
-        List fieldsInTestData = new ArrayList<>();
-        List fieldsInZFrame = new ArrayList<>();
-        Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
-        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
-        fieldsInTestData.add(newCol);
-
-        //Assert on columns
-        assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal");
-
-        //Assert on first row
-        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
-        assertEquals(row.getAs(newCol), Double.valueOf(newColVal), "value of added column is not as expected");
-    }
-
-    @Test
-    public void testWithColumnForStringValue() throws Exception {
-        List sampleDataSet = createSampleDataList(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-        String newCol = NEW_COLUMN;
-        String newColVal = "zingg";
-        ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal);
-
-        List fieldsInTestData = new ArrayList<>();
-        List fieldsInZFrame = new ArrayList<>();
-        Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
-        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
-        fieldsInTestData.add(newCol);
-
-        //Assert on columns
-        assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal");
-
-        //Assert on first row
-        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
-        assertEquals(row.getAs(newCol), newColVal, "value of added column is not as expected");
-    }
-
-    @Test
-    public void testWithColumnforAnotherColumn() throws Exception {
-        List sampleDataSet = createSampleDataList(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-        String oldCol = STR_RECID;
-        String newCol = NEW_COLUMN;
-        ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, (C) col(oldCol));
-
-        List fieldsInTestData = new ArrayList<>();
-        List fieldsInZFrame = new ArrayList<>();
-        Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
-        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
-        fieldsInTestData.add(newCol);
-
-        //Assert on columns
-        assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal");
-
-        //Assert on first row
-        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
-        assertEquals(Optional.of(row.getAs(newCol)), Optional.of(row.getAs(oldCol)), "value of added column is not as expected");
-    }
-
-    @Test
-    public void testGetMaxVal() throws Exception {
-        List sampleDataSet = createSampleDataZScore(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
-
-        assertEquals(400, zFrame.getMaxVal(ColName.CLUSTER_COLUMN), "Max value is not as expected");
-    }
-
-    @Test
-    public void testGroupByMinMax() throws Exception {
-        List sampleDataSet = createSampleDataZScore(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
-
-        ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.ID_COL));
-
-//        Dataset assertionDF = groupByDF.df();
-//        List assertionRows = assertionDF.collectAsList();
-//        for (Row row : assertionRows) {
-//            if(row.getInt(0)==1) {
-//                assertEquals(1001,row.getInt(1));
-//                assertEquals(2002,row.getInt(2));
-//            }
-//        }
-    }
-
-    @Test
-    public void testGroupByMinMax2() throws Exception {
-        List sampleDataSet = createSampleDataZScore(); //List
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
-
-        ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.CLUSTER_COLUMN));
-
-//        Dataset assertionDF = groupByDF.df();
-//        List assertionRows = assertionDF.collectAsList();
-//        for (Row row : assertionRows) {
-//            if(row.getInt(0)==100) {
-//                assertEquals(900,row.getInt(1));
-//                assertEquals(9002,row.getInt(2));
-//            }
-//        }
-    }
-
-    @Test
-    public void testRightJoinMultiCol() throws Exception {
-        List sampleDataSetInput = createSampleDataInput(); //List
-        ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class);
-        List sampleDataSetCluster = createSampleDataCluster(); //List
-        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class);
-
-        ZFrame joinedData = zFrameCluster.join(zFrameInput,ColName.ID_COL,ColName.SOURCE_COL,ZFrame.RIGHT_JOIN);
-        assertEquals(10,joinedData.count());
-    }
-
-    @Test
-    public void testFilterInCond() throws Exception {
-        List sampleDataSetInput = createSampleDataInput(); //List
-        ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class);
-        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
-        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class);
-        ZFrame filteredData = zFrameInput.filterInCond(ColName.ID_COL, zFrameCluster, ColName.COL_PREFIX+ ColName.ID_COL);
-        assertEquals(5,filteredData.count());
-    }
-
-    @Test
-    public void testFilterNotNullCond() throws Exception {
-        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
-        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class);
-
-        ZFrame filteredData = zFrameCluster.filterNotNullCond(ColName.SOURCE_COL);
-        assertEquals(3,filteredData.count());
-    }
-
-    @Test
-    public void testFilterNullCond() throws Exception {
-        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
-        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class);
-
-        ZFrame filteredData = zFrameCluster.filterNullCond(ColName.SOURCE_COL);
-        assertEquals(2, filteredData.count());
-    }
-
-    @Test
-    public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exception{
-        List sampleDataSetCluster = createSampleDataList();
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class);
-
-        String[] columnArray = new String[] {"surname", "postcode"};
-        ZFrame zFrameDeDuplicated = zFrame.dropDuplicates(columnArray);
-
-//        ZFrame<Dataset<Row>, Row, Column> sf2 = new SparkFrame(df.dropDuplicates(columnArray));
-//        assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates(columnArray), "SparkFrame.dropDuplicates(str[]) does not match with standard dropDuplicates(str[]) output");
-    }
-
-    @Test
-    public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws Exception {
-        List sampleDataSetCluster = createSampleDataList();
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class);
-        ZFrame zFrameDeDuplicated = zFrame.dropDuplicates("surname", "postcode");
-
-//        ZFrame<Dataset<Row>, Row, Column> sf2 = new SparkFrame(df.dropDuplicates("surname", "postcode"));
-//        assertTrueCheckingExceptOutput(sf2, sf.dropDuplicates("surname"), "SparkFrame.dropDuplicates(col1, col2) does not match with standard dropDuplicates(col1, col2) output");
-    }
-
-    @Test
-    public void testSortDescending() throws Exception {
-        List sampleData = createSampleDataListWithMixedDataType();
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class);
-
-        String col = STR_RECID;
-        ZFrame zFrameSortedDesc = zFrame.sortDescending(col);
-
-//        SparkFrame sf = new SparkFrame(df);
-//        String col = STR_RECID;
-//        ZFrame<Dataset<Row>,Row,Column> sf2 = sf.sortDescending(col);
-//        assertTrueCheckingExceptOutput(sf2, df.sort(functions.desc(col)), "SparkFrame.sortDescending() output is not as expected");
-    }
-
-    @Test
-    public void testSortAscending() throws Exception {
-        List sampleData = createSampleDataListWithMixedDataType();
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class);
-
-        String col = STR_RECID;
-        ZFrame zFrameSortedDesc = zFrame.sortAscending(col);
-
-//        SparkFrame sf = new SparkFrame(df);
-//        String col = STR_RECID;
-//        ZFrame<Dataset<Row>,Row,Column> sf2 = sf.sortAscending(col);
-//        assertTrueCheckingExceptOutput(sf2, df.sort(functions.asc(col)), "SparkFrame.sortAscending() output is not as expected");
-    }
-
-    @Test
-    public void testIsEmpty() {
-//        ZFrame
-//        Dataset df = sparkSession.emptyDataFrame();
-//        SparkFrame sf = new SparkFrame(df);
-//        assertTrue(sf.isEmpty(), "DataFrame is not empty");
-    }
-
-    @Test
-    public void testDistinct() throws Exception {
-        List sampleDataSet = createSampleDataList();
-        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-        zFrame.distinct();
-//        SparkFrame sf = new SparkFrame(df);
-//        SparkFrame sf2 = new SparkFrame(df.distinct());
-//        assertTrueCheckingExceptOutput(sf.distinct(), sf2, "SparkFrame.distict() does not match with standard distict() output");
-    }
-
-    //sample data to be used for testing
-    public static List createSampleDataList() {
-        List sample = new ArrayList();
-        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
-        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
-        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
-        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
-        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
-
-        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
-        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
-        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
-        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
-        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
-
-        return sample;
-    }
-
-    public static List createSampleDataListWithMixedDataType() {
-        List sample = new ArrayList();
-        sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873));
-        sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792));
-        sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717));
-        sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858));
-        sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747));
-
-        sample.add(new SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780));
-        sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909));
-        sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405));
-        sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406));
-        sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734));
-
-        return sample;
-    }
-
-    public static List createSampleDataZScore() {
-
-        List sample = new ArrayList<>();
-        sample.add(new SchemaZScore(0, 100, 900));
-        sample.add(new SchemaZScore(1, 100, 101));
-        sample.add(new SchemaZScore(1, 100, 2001));
-        sample.add(new SchemaZScore(1, 100, 2002));
-        sample.add(new SchemaZScore(11, 100, 2002));
-        sample.add(new SchemaZScore(3, 300, 3002));
-        sample.add(new SchemaZScore(3, 400, 4002));
-
-        return sample;
-    }
-
-    public static List createSampleDataCluster() {
-
-        List sample = new ArrayList<>();
-        sample.add(new SchemaCluster(1,100,1001,"b"));
-        sample.add(new SchemaCluster(2,100,1002,"a"));
-        sample.add(new SchemaCluster(3,100,2001,"b"));
-        sample.add(new SchemaCluster(4, 900, 2002, "c"));
-        sample.add(new SchemaCluster(5, 111, 9002, "c"));
-
-        return sample;
-    }
-
-    public static List createSampleDataClusterWithNull() {
-
-        List sample = new ArrayList<>();
-        sample.add(new SchemaCluster( 1,100,1001,"b"));
-        sample.add(new SchemaCluster( 2,100,1002,"a"));
-        sample.add(new SchemaCluster( 3,100,2001,null));
-        sample.add(new SchemaCluster(4,900,2002,"c"));
-        sample.add(new SchemaCluster( 5,111,9002,null));
-
-        return sample;
-    }
-
-    public static List createSampleDataInput() {
-
-        List sample = new ArrayList<>();
-        sample.add(new SchemaInput(1,"fname1","b" ));
-        sample.add(new SchemaInput( 2,"fname","a"));
-        sample.add(new SchemaInput(3,"fna","b"));
-        sample.add((new SchemaInput(4,"x","c")));
-        sample.add(new SchemaInput(5,"y","c"));
-        sample.add(new SchemaInput(11,"new1","b"));
-        sample.add(new SchemaInput(22,"new12","a"));
-        sample.add(new SchemaInput(33,"new13","b"));
-        sample.add(new SchemaInput( 44,"new14","c"));
-        sample.add(new SchemaInput(55,"new15","c"));
-
-        return sample;
-    }
-
-
-    protected void assertTrueCheckingExceptOutput(ZFrame sf1, ZFrame sf2, String message) {
-        assertTrue(sf1.except(sf2).isEmpty(), message);
-    }
-
-    //POJO classes for defining schema
-
-    public static class Schema {
-        public final String recid;
-        public final String givenname;
-        public final String surname;
-        public final String suburb;
-        public final String postcode;
-
-        public Schema(String recid, String givename, String surname, String suburb, String postcode) {
-            this.recid = recid;
-            this.givenname = givename;
-            this.surname = surname;
-            this.suburb = suburb;
-            this.postcode = postcode;
-        }
-    }
-
-    public static class SchemaWithMixedDataType {
-        public final Integer recid;
-        public final String givenname;
-        public final String surname;
-        public final Double cost;
-        public final Integer postcode;
-
-        public SchemaWithMixedDataType(Integer recid, String givename, String surname, Double cost, Integer postcode) {
-            this.recid = recid;
-            this.givenname = givename;
-            this.surname = surname;
-            this.cost = cost;
-            this.postcode = postcode;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public abstract class TestZFrameBase<S, D, R, C> {
+
+    public static final Log LOG = LogFactory.getLog(TestZFrameBase.class);
+    public static final String NEW_COLUMN = "newColumn";
+    public static final String STR_RECID = "recid";
+    private final DFObjectUtil<S, D, R, C> dfObjectUtil;
+
+    public TestZFrameBase(DFObjectUtil<S, D, R, C> dfObjectUtil) {
+        this.dfObjectUtil = dfObjectUtil;
+    }
+
+
+    @Test
+    public void testCreateSparkDataFrameAndGetDF() throws Exception {
+        List sampleDataSet = createSampleDataList();
+
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+
+        //assert rows
+        List pojoList = (List) zFrame.collectAsList();
+        for (int idx = 0; idx < sampleDataSet.size(); idx++) {
+            Assertions.assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx)));
+        }
+    }
+
+    @Test
+    public void testColumnsNamesandCount() throws Exception {
+        List sampleDataSet = createSampleDataList();
+
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+
+        //assert on fields
+        List fieldsInTestData = new ArrayList<>();
+        List fieldsInZFrame = new ArrayList<>();
+        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
+        Arrays.stream(zFrame.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
+        assertEquals(fieldsInTestData, fieldsInZFrame,
+                "Columns of sample data and zFrame are not equal");
+    }
+
+    @Test
+    public void testSelectWithSingleColumnName() throws Exception {
+        List sampleDataSet = createSampleDataList(); //List
+
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+        String colName = "recid";
+        List pojoList = (List) zFrame.select(colName).collectAsList();
+
+        for (int idx = 0; idx < sampleDataSet.size(); idx++) {
+            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+                    "value from zFrame and sampleData doesn't match");
+        }
+    }
+
+    @Test
+    public void testSelectWithColumnList() throws Exception {
+        List sampleDataSet = createSampleDataList(); //List
+
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+        List columnList = (List) Arrays.asList(functions.col("recid"), functions.col("surname"), functions.col("postcode"));
+
+        List pojoList = (List) zFrame.select(columnList).collectAsList();
+
+        for (int idx = 0; idx < sampleDataSet.size(); idx++) {
+            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode,
+                    "value from zFrame and sampleData doesn't match");
+        }
+    }
+
+    @Disabled
+    @Test
+    public void testSelectWithColumnArray() throws Exception {
+        List sampleDataSet = createSampleDataList(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+
+        C[] columnArray = (C[]) new Object[3];
+        columnArray[0] = (C) functions.col("recid");
+        columnArray[1] = (C) functions.col("surname");
+        columnArray[2] = (C) functions.col("postcode");
+
+        List pojoList = (List) zFrame.select(columnArray).collectAsList();
+
+        for (int idx = 0; idx < sampleDataSet.size(); idx++) {
+            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode,
+                    "value from zFrame and sampleData doesn't match");
+        }
+    }
+
+    @Test
+    public void testSelectWithMultipleColumnNamesAsString() throws Exception {
+        List sampleDataSet = createSampleDataList(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+
+        List pojoList = (List) zFrame.select("recid", "surname", "postcode").collectAsList();
+
+        for (int idx = 0; idx < sampleDataSet.size(); idx++) {
+            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname,
+                    "value from zFrame and sampleData doesn't match");
+            Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode,
+                    "value from zFrame and sampleData doesn't match");
match"); + } + } + + @Test + public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List pojoList = (List) zFrame.selectExpr("recid as RecordId", "surname as FamilyName", + "postcode as Pin").collectAsList(); + + for (int idx = 0; idx < sampleDataSet.size(); idx++) { + Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, + "value from zFrame and sampleData doesn't match"); + Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, + "value from zFrame and sampleData doesn't match"); + Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, + "value from zFrame and sampleData doesn't match"); + } + } + + @Test + public void testDropSingleColumn() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List fieldsInZFrame = new ArrayList<>(); + List fieldsInTestData = new ArrayList<>(); + Arrays.stream(zFrame.drop("recid").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.remove("recid"); + + assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); + } + + @Test + public void testDropColumnsAsStringArray() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + List fieldsInZFrame = new ArrayList<>(); + List fieldsInTestData = new ArrayList<>(); + Arrays.stream(zFrame.drop("recid", "surname", "postcode").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.remove("recid"); + fieldsInTestData.remove("surname"); + fieldsInTestData.remove("postcode"); + + assertEquals(fieldsInTestData, fieldsInZFrame, + "Fields in zFrame and sample data doesn't match"); + } + + @Test + public void testLimit() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + int len = 5; + List pojoList = (List) zFrame.limit(len).collectAsList(); + + assertEquals(pojoList.size(), len, "Size is not equal"); + + //assert on rows + for (int idx = 0; idx < len; idx++) { + Assertions.assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); + } + } + + @Test + public void testHead() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + GenericRowWithSchema row = (GenericRowWithSchema) zFrame.head(); + + Assertions.assertArrayEquals(row.values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(0)), + "Top row from zFrame and sample data doesn't match"); + } + + @Test + public void testGetAsInt() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertTrue(zFrame.getAsInt(row, 
"recid") == sampleDataSet.get(0).recid, + "row.getAsInt(col) hasn't returned correct int value"); + } + + @Test + public void testGetAsString() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(0).surname, + "row.getAsString(col) hasn't returned correct string value"); + } + + @Test + public void testGetAsDouble() throws Exception { + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + + R row = zFrame.head(); + assertEquals(zFrame.getAsDouble(row, "cost"), sampleDataSet.get(0).cost, + "row.getAsDouble(col) hasn't returned correct double value"); + } + + @Test + public void testWithColumnForIntegerValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + + String newCol = NEW_COLUMN; + int newColVal = 36; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, + "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + Assertions.assertEquals(row.getAs(newCol), Integer.valueOf(newColVal), + "value of added column is not as expected"); + } + + @Test + public void testWithColumnForDoubleValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String newCol = NEW_COLUMN; + double newColVal = 3.14; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); + Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + fieldsInTestData.add(newCol); + + //Assert on columns + assertEquals(fieldsInTestData, fieldsInZFrame, + "Columns of sample data and zFrame are not equal"); + + //Assert on first row + GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); + Assertions.assertEquals(row.getAs(newCol), Double.valueOf(newColVal), + "value of added column is not as expected"); + } + + @Test + public void testWithColumnForStringValue() throws Exception { + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + String newCol = NEW_COLUMN; + String newColVal = "zingg"; + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); + + List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList<>(); + Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> 
+        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
+        fieldsInTestData.add(newCol);
+
+        //Assert on columns
+        assertEquals(fieldsInTestData, fieldsInZFrame,
+                "Columns of sample data and zFrame are not equal");
+
+        //Assert on first row
+        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
+        Assertions.assertEquals(row.getAs(newCol), newColVal,
+                "value of added column is not as expected");
+    }
+
+    @Test
+    public void testWithColumnforAnotherColumn() throws Exception {
+        List sampleDataSet = createSampleDataList(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
+        String oldCol = STR_RECID;
+        String newCol = NEW_COLUMN;
+        ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, (C) functions.col(oldCol));
+
+        List fieldsInTestData = new ArrayList<>();
+        List fieldsInZFrame = new ArrayList<>();
+        Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName()));
+        Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName()));
+        fieldsInTestData.add(newCol);
+
+        //Assert on columns
+        assertEquals(fieldsInTestData, fieldsInZFrame,
+                "Columns of sample data and zFrame are not equal");
+
+        //Assert on first row
+        GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head();
+        assertEquals(Optional.of(row.getAs(newCol)), Optional.of(row.getAs(oldCol)),
+                "value of added column is not as expected");
+    }
+
+    @Test
+    public void testGetMaxVal() throws Exception {
+        List sampleDataSet = createSampleDataZScore(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
+
+        assertEquals(400, zFrame.getMaxVal(ColName.CLUSTER_COLUMN),
+                "Max value is not as expected");
+    }
+
+    @Test
+    public void testGroupByMinMax() throws Exception {
+        List sampleDataSet = createSampleDataZScore(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
+
+        ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.ID_COL));
+
+        List assertionRows = (List) groupByDF.collectAsList();
+        for (GenericRowWithSchema row : assertionRows) {
+            if(row.getInt(0)==1) {
+                Assertions.assertEquals(1001,row.getInt(1));
+                Assertions.assertEquals(2002,row.getInt(2));
+            }
+        }
-    }
+    }
-
-    public static class SchemaZScore {
-        public final Integer z_zid;
-        public final Integer z_cluster;
-        public final Integer z_score;
-
-        public SchemaZScore(Integer z_zid, Integer z_cluster, Integer z_score) {
-            this.z_zid = z_zid;
-            this.z_cluster = z_cluster;
-            this.z_score = z_score;
-        }
-    }
-
-    public static class SchemaCluster {
-        public final Integer z_zid;
-        public final Integer z_cluster;
-        public final Integer z_score;
-        public final String z_zsource;
-
-        public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, String z_zsource) {
-            this.z_zid = z_zid;
-            this.z_cluster = z_cluster;
-            this.z_score = z_score;
-            this.z_zsource = z_zsource;
-        }
-    }
+
+    @Test
+    public void testGroupByMinMax2() throws Exception {
+        List sampleDataSet = createSampleDataZScore(); //List
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class);
-
-    public static class SchemaInput {
-        public final Integer z_zid;
-        public final String fname;
-        public final String z_zsource;
+        ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.CLUSTER_COLUMN));
-
-        public SchemaInput(Integer z_zid, String fname, String z_zsource) {
-            this.z_zid = z_zid;
-            this.fname = fname;
-            this.z_zsource = z_zsource;
+        List assertionRows = (List) groupByDF.collectAsList();
+        for (GenericRowWithSchema row : assertionRows) {
+            if(row.getInt(0)==100) {
+                Assertions.assertEquals(900,row.getInt(1));
+                Assertions.assertEquals(9002,row.getInt(2));
+            }
+        }
-    }
+    }
+
+    @Test
+    public void testRightJoinMultiCol() throws Exception {
+        List sampleDataSetInput = createSampleDataInput(); //List
+        ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class);
+        List sampleDataSetCluster = createSampleDataCluster(); //List
+        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class);
+
+        ZFrame joinedData = zFrameCluster.join(zFrameInput, ColName.ID_COL, ColName.SOURCE_COL, ZFrame.RIGHT_JOIN);
+        assertEquals(10, joinedData.count());
+    }
+
+    @Test
+    public void testFilterInCond() throws Exception {
+        List sampleDataSetInput = createSampleDataInput(); //List
+        ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class);
+        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
+        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class);
+        ZFrame filteredData = zFrameInput.filterInCond(ColName.ID_COL, zFrameCluster, ColName.COL_PREFIX + ColName.ID_COL);
+        assertEquals(5, filteredData.count());
+    }
+
+    @Test
+    public void testFilterNotNullCond() throws Exception {
+        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
+        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class);
+
+        ZFrame filteredData = zFrameCluster.filterNotNullCond(ColName.SOURCE_COL);
+        assertEquals(3, filteredData.count());
+    }
+
+    @Test
+    public void testFilterNullCond() throws Exception {
+        List sampleDataSetCluster = createSampleDataClusterWithNull(); //List
+        ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class);
+
+        ZFrame filteredData = zFrameCluster.filterNullCond(ColName.SOURCE_COL);
+        assertEquals(2, filteredData.count());
+    }
+
+    @Test
+    public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exception {
+        List sampleData = createSampleDataList();
+        List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode();
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Schema.class);
+
+        String[] columnArray = new String[]{"surname", "postcode"};
+        ZFrame zFrameDeDuplicated = zFrame.dropDuplicates(columnArray);
+
+        List pojoList = (List) zFrameDeDuplicated.collectAsList();
+
+        List rowsInZFrameDistinct = new ArrayList<>();
+        List rowsInSampleDataDistinct = new ArrayList<>();
+        pojoList.forEach(entry -> rowsInZFrameDistinct.add(entry.values()));
+        for (Schema entry : sampleDataWithDistinctSurnameAndPostCode) {
+            rowsInSampleDataDistinct.add(PojoToArrayConverter.getObjectArray(entry));
+        }
+
+        int matchedRowCount = 0;
+        for(Object[] rowInZFrameDistinct : rowsInZFrameDistinct) {
+            for (Object[] rowInSampleDataDistinct : rowsInSampleDataDistinct) {
+                if (Arrays.equals(Arrays.stream(rowInZFrameDistinct).toArray(), Arrays.stream(rowInSampleDataDistinct).toArray())) {
+                    matchedRowCount++;
+                }
+            }
+        }
+
+        assertEquals(rowsInSampleDataDistinct.size(), matchedRowCount,
+                "Rows from zFrame and sample data doesn't match after drop duplicates");
+        assertEquals(rowsInZFrameDistinct.size(), matchedRowCount,
+                "Rows from zFrame and sample data doesn't match after drop duplicates");
+    }
+
+    @Test
+    public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws Exception {
+        List sampleDataSetCluster = createSampleDataList();
+        List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode();
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class);
+        ZFrame zFrameDeDuplicated = zFrame.dropDuplicates("surname", "postcode");
+
+        List pojoList = (List) zFrameDeDuplicated.collectAsList();
+
+        List rowsInZFrameDistinct = new ArrayList<>();
+        List rowsInSampleDataDistinct = new ArrayList<>();
+        pojoList.forEach(entry -> rowsInZFrameDistinct.add(entry.values()));
+        for (Schema entry : sampleDataWithDistinctSurnameAndPostCode) {
+            rowsInSampleDataDistinct.add(PojoToArrayConverter.getObjectArray(entry));
+        }
+
+        int matchedRowCount = 0;
+        for(Object[] rowInZFrameDistinct : rowsInZFrameDistinct) {
+            for (Object[] rowInSampleDataDistinct : rowsInSampleDataDistinct) {
+                if (Arrays.equals(Arrays.stream(rowInZFrameDistinct).toArray(), Arrays.stream(rowInSampleDataDistinct).toArray())) {
+                    matchedRowCount++;
+                }
+            }
+        }
+
+        assertEquals(rowsInSampleDataDistinct.size(), matchedRowCount,
+                "Rows from zFrame and sample data doesn't match after drop duplicates");
+        assertEquals(rowsInZFrameDistinct.size(), matchedRowCount,
+                "Rows from zFrame and sample data doesn't match after drop duplicates");
+    }
+
+    @Test
+    public void testSortDescending() throws Exception {
+        List sampleData = createSampleDataListWithMixedDataType();
+        sampleData.sort((a, b) -> a.recid > b.recid ? -1 : 1);
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class);
+
+        String col = STR_RECID;
+        ZFrame zFrameSortedDesc = zFrame.sortDescending(col);
+        List pojoList = (List) zFrameSortedDesc.collectAsList();
+
+        for(int idx = 0; idx < sampleData.size(); idx++) {
+            Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleData.get(idx)), pojoList.get(idx).values(),
+                    "Row from descending sorted sample data is not equal to row from descending sorted zFrame");
+        }
+    }
+
+    @Test
+    public void testSortAscending() throws Exception {
+        List sampleData = createSampleDataListWithMixedDataType();
+        sampleData.sort((a, b) -> a.recid < b.recid ? -1 : 1);
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class);
+
+        String col = STR_RECID;
+        ZFrame zFrameSortedAsc = zFrame.sortAscending(col);
+        List pojoList = (List) zFrameSortedAsc.collectAsList();
+
+        for(int idx = 0; idx < sampleData.size(); idx++) {
+            Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleData.get(idx)), pojoList.get(idx).values(),
+                    "Row from ascending sorted sample data is not equal to row from ascending sorted zFrame");
+        }
+    }
+
+    @Test
+    public void testIsEmpty() throws Exception {
+        List emptySampleData = createEmptySampleData();
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(emptySampleData, Schema.class);
+
+        assertTrue(zFrame.isEmpty(), "zFrame is not empty");
+    }
+
+    @Test
+    public void testDistinct() throws Exception {
+        List sampleData = createSampleDataList();
+        List sampleDataDistinct = createSampleDataListDistinct();
+        ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Schema.class);
+
+        List pojoList = (List) zFrame.distinct().collectAsList();
+
+        for(int idx = 0; idx < sampleDataDistinct.size(); idx++) {
+            Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleDataDistinct.get(idx)), pojoList.get(idx).values(),
+                    "Row from sample data is not equal to row from zFrame");
+        }
+    }
+
+    //sample data to be used for testing
+
+    public static List createEmptySampleData() {
+
+        return new ArrayList<>();
+    }
+
+    public static List createSampleDataList() {
+        List sample = new ArrayList();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
+        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
+        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
+        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
+
+        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
+        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
+        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
+        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
+        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
+
+        return sample;
+    }
+
+    public static List createSampleDataListDistinct() {
+        List sample = new ArrayList();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
+        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
+        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
+        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
+
+        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
+        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
+        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
+        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
+        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
+
+        return sample;
+    }
+
+    public static List createSampleDataListWithDistinctSurnameAndPostcode() {
+        List sample = new ArrayList();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
"hendersonville", "2873g")); + sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); + sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); + sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); + sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); + + sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); + sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); + sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); + sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); + sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); + + return sample; + } + + public static List createSampleDataListWithMixedDataType() { + List sample = new ArrayList(); + sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873)); + sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792)); + sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717)); + sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858)); + sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747)); + + sample.add(new SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780)); + sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909)); + sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405)); + sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406)); + sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734)); + + return sample; + } + + public static List createSampleDataZScore() { + + List sample = new ArrayList<>(); + sample.add(new SchemaZScore(0, 100, 900)); + sample.add(new SchemaZScore(1, 100, 1001)); + sample.add(new SchemaZScore(1, 100, 1002)); + sample.add(new SchemaZScore(1, 100, 2001)); + sample.add(new SchemaZScore(1, 100, 2002)); + sample.add(new SchemaZScore(11, 100, 9002)); + sample.add(new SchemaZScore(3, 300, 3001)); + sample.add(new SchemaZScore(3, 300, 3002)); + sample.add(new SchemaZScore(3, 400, 4001)); + sample.add(new SchemaZScore(4, 400, 4002)); + + return sample; + } + + public static List createSampleDataCluster() { + + List sample = new ArrayList<>(); + sample.add(new SchemaCluster(1, 100, 1001, "b")); + sample.add(new SchemaCluster(2, 100, 1002, "a")); + sample.add(new SchemaCluster(3, 100, 2001, "b")); + sample.add(new SchemaCluster(4, 900, 2002, "c")); + sample.add(new SchemaCluster(5, 111, 9002, "c")); + + return sample; + } + + public static List createSampleDataClusterWithNull() { + + List sample = new ArrayList<>(); + sample.add(new SchemaClusterNull(1, 100, 1001, "b")); + sample.add(new SchemaClusterNull(2, 100, 1002, "a")); + sample.add(new SchemaClusterNull(3, 100, 2001, null)); + sample.add(new SchemaClusterNull(4, 900, 2002, "c")); + sample.add(new SchemaClusterNull(5, 111, 9002, null)); + + return sample; + } + + public static List createSampleDataInput() { + + List sample = new ArrayList<>(); + sample.add(new SchemaInput(1, "fname1", "b")); + sample.add(new SchemaInput(2, "fname", "a")); + sample.add(new SchemaInput(3, "fna", "b")); + sample.add((new SchemaInput(4, "x", "c"))); + sample.add(new SchemaInput(5, "y", "c")); + sample.add(new SchemaInput(11, "new1", "b")); + sample.add(new SchemaInput(22, "new12", "a")); + sample.add(new 
+        sample.add(new SchemaInput(44, "new14", "c"));
+        sample.add(new SchemaInput(55, "new15", "c"));
+
+        return sample;
+    }
+
+    protected void assertTrueCheckingExceptOutput(ZFrame sf1, ZFrame sf2, String message) {
+        assertTrue(sf1.except(sf2).isEmpty(), message);
+    }
+
+    //POJO classes for defining schema
+
+    public static class Schema {
+        public final String recid;
+        public final String givenname;
+        public final String surname;
+        public final String suburb;
+        public final String postcode;
+
+        public Schema(String recid, String givename, String surname, String suburb, String postcode) {
+            this.recid = recid;
+            this.givenname = givename;
+            this.surname = surname;
+            this.suburb = suburb;
+            this.postcode = postcode;
+        }
+    }
+
+    public static class SchemaWithMixedDataType {
+        public final Integer recid;
+        public final String givenname;
+        public final String surname;
+        public final Double cost;
+        public final Integer postcode;
+
+        public SchemaWithMixedDataType(Integer recid, String givename, String surname, Double cost, Integer postcode) {
+            this.recid = recid;
+            this.givenname = givename;
+            this.surname = surname;
+            this.cost = cost;
+            this.postcode = postcode;
+        }
+    }
+
+    public static class SchemaZScore {
+        public final Integer z_zid;
+        public final Integer z_cluster;
+        public final Integer z_score;
+
+        public SchemaZScore(Integer z_zid, Integer z_cluster, Integer z_score) {
+            this.z_zid = z_zid;
+            this.z_cluster = z_cluster;
+            this.z_score = z_score;
+        }
+    }
+
+    public static class SchemaClusterNull {
+        public final Integer z_z_zid;
+        public final Integer z_cluster;
+        public final Integer z_score;
+        public final String z_zsource;
+
+        public SchemaClusterNull(Integer z_z_zid, Integer z_cluster, Integer z_score, String z_zsource) {
+            this.z_z_zid = z_z_zid;
+            this.z_cluster = z_cluster;
+            this.z_score = z_score;
+            this.z_zsource = z_zsource;
+        }
+    }
+
+    public static class SchemaCluster {
+        public final Integer z_zid;
+        public final Integer z_cluster;
+        public final Integer z_score;
+        public final String z_zsource;
+
+        public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, String z_zsource) {
+            this.z_zid = z_zid;
+            this.z_cluster = z_cluster;
+            this.z_score = z_score;
+            this.z_zsource = z_zsource;
+        }
+    }
+
+    public static class SchemaInput {
+        public final Integer z_zid;
+        public final String fname;
+        public final String z_zsource;
+
+        public SchemaInput(Integer z_zid, String fname, String z_zsource) {
+            this.z_zid = z_zid;
+            this.fname = fname;
+            this.z_zsource = z_zsource;
+        }
+    }
+}
\ No newline at end of file

From 5449bfc0b0d2147c91c187372899b96701e32348 Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 02:14:35 +0530
Subject: [PATCH 154/219] replaced GenericRowWithSchema with R

---
 .../client/util/PojoToArrayConverter.java |   6 +-
 .../java/zingg/client/TestZFrameBase.java | 300 +++++++++++-------
 2 files changed, 193 insertions(+), 113 deletions(-)
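Note on this commit: it removes the last Spark-specific type from the shared test base. Wherever a test previously cast `zFrame.head()` or `collectAsList()` results to Spark's `GenericRowWithSchema`, it now keeps the engine's opaque row type `R` and reads values through the `ZFrame` accessors. A minimal sketch of the resulting pattern, assuming only the `ZFrame<D, R, C>` accessor the reworked tests rely on (`getAsString`); the helper class and method names here are illustrative, not part of the patch:

    import java.lang.reflect.Field;

    import zingg.common.client.ZFrame;

    public class EngineAgnosticRowAssert {

        // Compares one engine row R against a POJO field by field, going through
        // ZFrame.getAsString(row, column) instead of casting R to a Spark row class.
        // Assumes every POJO field maps to a same-named column, as in the tests below.
        public static <D, R, C> void assertRowMatchesPojo(ZFrame<D, R, C> zFrame, R row, Object pojo)
                throws IllegalAccessException {
            for (Field field : pojo.getClass().getDeclaredFields()) {
                field.setAccessible(true);
                Object expected = field.get(pojo);
                String actual = zFrame.getAsString(row, field.getName());
                if (expected != null && !expected.toString().equals(actual)) {
                    throw new AssertionError("value in ZFrame and sample input is not same: " + field.getName());
                }
            }
        }
    }

This is essentially the comparison the reworked tests inline below; keeping it generic over R means the same suite can later be bound to a non-Spark ZFrame implementation without touching the assertions.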
diff --git a/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
index 5e7928521..9a47c7b43 100644
--- a/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
+++ b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java
@@ -4,8 +4,8 @@
 public class PojoToArrayConverter {
 
-    public static Object[] getObjectArray(Object person) throws IllegalAccessException {
-        Field[] fields = person.getClass().getDeclaredFields();
+    public static Object[] getObjectArray(Object object) throws IllegalAccessException {
+        Field[] fields = object.getClass().getDeclaredFields();
         int fieldCount = fields.length;
 
         Object[] objArr = new Object[fieldCount];
@@ -13,7 +13,7 @@ public static Object[] getObjectArray(Object person) throws IllegalAccessExcepti
         Field field = fields[i];
         field.setAccessible(true);
 
-        objArr[i] = field.get(person);
+        objArr[i] = field.get(object);
     }
 
     return objArr;
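For context, `PojoToArrayConverter` flattens a test POJO into an `Object[]` by reflecting over its declared fields; the hunk above only renames the parameter from `person` to `object` to match its general use. A self-contained sketch of the same logic against a hypothetical POJO (the `Person` class is illustrative; field order follows `getDeclaredFields()`, which is declaration order on common JVMs but is not guaranteed by the Java spec):

    import java.lang.reflect.Field;
    import java.util.Arrays;

    public class PojoToArrayDemo {

        // Hypothetical POJO standing in for the Schema classes used by the tests.
        static class Person {
            final String name = "vonnell";
            final int postcode = 27406;
        }

        // Same logic as PojoToArrayConverter.getObjectArray, inlined to keep the sketch self-contained.
        static Object[] getObjectArray(Object object) throws IllegalAccessException {
            Field[] fields = object.getClass().getDeclaredFields();
            Object[] objArr = new Object[fields.length];
            for (int i = 0; i < fields.length; i++) {
                fields[i].setAccessible(true);
                objArr[i] = fields[i].get(object);   // boxes primitives automatically
            }
            return objArr;
        }

        public static void main(String[] args) throws IllegalAccessException {
            // Prints [vonnell, 27406]: one array slot per declared field.
            System.out.println(Arrays.toString(getObjectArray(new Person())));
        }
    }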
diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java
index efcf4d02a..c0d24877b 100644
--- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java
+++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java
@@ -2,22 +2,20 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
-import org.apache.spark.sql.functions;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import zingg.common.client.ZFrame;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
-import zingg.common.client.util.PojoToArrayConverter;
+import zingg.common.core.ZinggException;
 
+import java.lang.reflect.Field;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
 
-import static org.junit.jupiter.api.Assertions.assertArrayEquals;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -40,9 +38,15 @@ public void testCreateSparkDataFrameAndGetDF() throws Exception {
         ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
 
         //assert rows
-        List pojoList = (List) zFrame.collectAsList();
+        List rows = zFrame.collectAsList();
+        List fields = List.of(Schema.class.getDeclaredFields());
         for (int idx = 0; idx < sampleDataSet.size(); idx++) {
-            Assertions.assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx)));
+            R row = rows.get(idx);
+            for (Field column : fields) {
+                String columnName = column.getName();
+                assertEquals(column.get(sampleDataSet.get(idx)).toString(), zFrame.getAsString(row, columnName),
+                        "value in ZFrame and sample input is not same");
+            }
         }
     }
 
@@ -67,33 +71,43 @@ public void testSelectWithSingleColumnName() throws Exception {
         ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
         String colName = "recid";
-        List pojoList = (List) zFrame.select(colName).collectAsList();
-
+        List rows = zFrame.select(colName).collectAsList();
         for (int idx = 0; idx < sampleDataSet.size(); idx++) {
-            assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
-                    "value from zFrame and sampleData doesn't match");
+            R row = rows.get(idx);
+            assertEquals(sampleDataSet.get(idx).recid, zFrame.getAsString(row, colName),
+                    "value in ZFrame and sample input is not same");
         }
     }
 
+    /*
+    list of string can not be cast to list of C
+    zFrame select does not have an interface method for List
+    */
+    @Disabled
     @Test
     public void testSelectWithColumnList() throws Exception {
         List sampleDataSet = createSampleDataList(); //List
 
         ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
-        List columnList = (List) Arrays.asList(functions.col("recid"), functions.col("surname"), functions.col("postcode"));
-        List pojoList = (List) zFrame.select(columnList).collectAsList();
+        List columnList = (List) Arrays.asList("recid", "surname", "postcode");
+        List rows = zFrame.select(columnList).collectAsList();
 
         for (int idx = 0; idx < sampleDataSet.size(); idx++) {
-            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+            R row = rows.get(idx);
+            Assertions.assertEquals(zFrame.getAsString(row, "recid"), sampleDataSet.get(idx).recid,
                     "value from zFrame and sampleData doesn't match");
-            Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname,
+            Assertions.assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(idx).surname,
                     "value from zFrame and sampleData doesn't match");
-            Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode,
+            Assertions.assertEquals(zFrame.getAsString(row, "postcode"), sampleDataSet.get(idx).postcode,
                     "value from zFrame and sampleData doesn't match");
         }
     }
 
+    /*
+    string can not be cast to C
+    zFrame doesn't have an interface method for C[]
+    */
     @Disabled
     @Test
     public void testSelectWithColumnArray() throws Exception {
         List sampleDataSet = createSampleDataList(); //List
         ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class);
 
         C[] columnArray = (C[]) new Object[3];
-        columnArray[0] = (C) functions.col("recid");
-        columnArray[1] = (C) functions.col("surname");
-        columnArray[2] = (C) functions.col("postcode");
+        columnArray[0] = (C) "recid";
+        columnArray[1] = (C) "surname";
+        columnArray[2] = (C) "postcode";
 
-        List pojoList = (List) zFrame.select(columnArray).collectAsList();
+        List rows = zFrame.select(columnArray).collectAsList();
 
         for (int idx = 0; idx < sampleDataSet.size(); idx++) {
-            Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid,
+            R row = rows.get(idx);
+            Assertions.assertEquals(zFrame.getAsString(row, "recid"), sampleDataSet.get(idx).recid,
                     "value from zFrame and sampleData doesn't match");
-            Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname,
+            Assertions.assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(idx).surname,
                     "value from zFrame and sampleData doesn't match");
-            Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode,
+            Assertions.assertEquals(zFrame.getAsString(row, "postcode"), sampleDataSet.get(idx).postcode,
                     "value from zFrame and sampleData doesn't match");
         }
     }
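The two @Disabled tests above document an erasure limitation rather than a bug: once the column type is the generic parameter C, a List built from plain strings cannot be safely cast to List of C, and an Object[] cast to C[] only fails when the engine consumes it. One possible way out, sketched under the assumption that each engine binding can supply its own column factory; the col/columns helpers here are hypothetical and not part of the patch:

    import java.util.ArrayList;
    import java.util.List;

    public abstract class ColumnFactorySketch<C> {

        // Engine binding supplies the real column object, e.g. functions.col(name) on Spark.
        protected abstract C col(String name);

        // Builds a List<C> without unchecked casts, so a generic base test could
        // pass columns("recid", "surname", "postcode") to zFrame.select(...) safely.
        public List<C> columns(String... names) {
            List<C> cols = new ArrayList<>();
            for (String name : names) {
                cols.add(col(name));
            }
            return cols;
        }
    }

With such a hook the two disabled selects could be re-enabled per engine instead of being skipped globally.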
"value from zFrame and sampleData doesn't match"); } } @@ -139,15 +155,16 @@ public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Except List sampleDataSet = createSampleDataList(); //List ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - List pojoList = (List) zFrame.selectExpr("recid as RecordId", "surname as FamilyName", + List rows = zFrame.selectExpr("recid as RecordId", "surname as FamilyName", "postcode as Pin").collectAsList(); for (int idx = 0; idx < sampleDataSet.size(); idx++) { - Assertions.assertEquals(pojoList.get(idx).values()[0], sampleDataSet.get(idx).recid, + R row = rows.get(idx); + Assertions.assertEquals(zFrame.getAsString(row, "RecordId"), sampleDataSet.get(idx).recid, "value from zFrame and sampleData doesn't match"); - Assertions.assertEquals(pojoList.get(idx).values()[1], sampleDataSet.get(idx).surname, + Assertions.assertEquals(zFrame.getAsString(row, "FamilyName"), sampleDataSet.get(idx).surname, "value from zFrame and sampleData doesn't match"); - Assertions.assertEquals(pojoList.get(idx).values()[2], sampleDataSet.get(idx).postcode, + Assertions.assertEquals(zFrame.getAsString(row, "Pin"), sampleDataSet.get(idx).postcode, "value from zFrame and sampleData doesn't match"); } } @@ -188,13 +205,19 @@ public void testLimit() throws Exception { List sampleDataSet = createSampleDataList(); //List ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); int len = 5; - List pojoList = (List) zFrame.limit(len).collectAsList(); + List rows = zFrame.limit(len).collectAsList(); - assertEquals(pojoList.size(), len, "Size is not equal"); + assertEquals(rows.size(), len, "Size is not equal"); //assert on rows + List fields = List.of(Schema.class.getDeclaredFields()); for (int idx = 0; idx < len; idx++) { - Assertions.assertArrayEquals(pojoList.get(idx).values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(idx))); + R row = rows.get(idx); + for (Field column : fields) { + String columnName = column.getName(); + assertEquals(column.get(sampleDataSet.get(idx)).toString(), zFrame.getAsString(row, columnName), + "value in ZFrame and sample input is not same"); + } } } @@ -203,10 +226,13 @@ public void testHead() throws Exception { List sampleDataSet = createSampleDataList(); //List ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); - GenericRowWithSchema row = (GenericRowWithSchema) zFrame.head(); - - Assertions.assertArrayEquals(row.values(), PojoToArrayConverter.getObjectArray(sampleDataSet.get(0)), - "Top row from zFrame and sample data doesn't match"); + R row = zFrame.head(); + List fields = List.of(Schema.class.getDeclaredFields()); + for (Field column : fields) { + String columnName = column.getName(); + assertEquals(column.get(sampleDataSet.get(0)).toString(), zFrame.getAsString(row, columnName), + "value in ZFrame and sample input is not same"); + } } @Test @@ -259,8 +285,8 @@ public void testWithColumnForIntegerValue() throws Exception { "Columns of sample data and zFrame are not equal"); //Assert on first row - GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); - Assertions.assertEquals(row.getAs(newCol), Integer.valueOf(newColVal), + R row = zFrameWithAddedColumn.head(); + Assertions.assertEquals(zFrame.getAsInt(row, newCol), Integer.valueOf(newColVal), "value of added column is not as expected"); } @@ -283,8 +309,8 @@ public void testWithColumnForDoubleValue() throws Exception { "Columns of sample data and zFrame are not equal"); 
//Assert on first row - GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); - Assertions.assertEquals(row.getAs(newCol), Double.valueOf(newColVal), + R row = zFrameWithAddedColumn.head(); + Assertions.assertEquals(zFrame.getAsDouble(row, newCol), Double.valueOf(newColVal), "value of added column is not as expected"); } @@ -307,18 +333,18 @@ public void testWithColumnForStringValue() throws Exception { "Columns of sample data and zFrame are not equal"); //Assert on first row - GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); - Assertions.assertEquals(row.getAs(newCol), newColVal, + R row = zFrameWithAddedColumn.head(); + Assertions.assertEquals(zFrame.getAsString(row, newCol), newColVal, "value of added column is not as expected"); } @Test - public void testWithColumnforAnotherColumn() throws Exception { + public void testWithColumnForAnotherColumn() throws Exception { List sampleDataSet = createSampleDataList(); //List ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); String oldCol = STR_RECID; String newCol = NEW_COLUMN; - ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, (C) functions.col(oldCol)); + ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, zFrame.col(oldCol)); List fieldsInTestData = new ArrayList<>(); List fieldsInZFrame = new ArrayList<>(); @@ -331,8 +357,8 @@ public void testWithColumnforAnotherColumn() throws Exception { "Columns of sample data and zFrame are not equal"); //Assert on first row - GenericRowWithSchema row = (GenericRowWithSchema) zFrameWithAddedColumn.head(); - assertEquals(Optional.of(row.getAs(newCol)), Optional.of(row.getAs(oldCol)), + R row = zFrameWithAddedColumn.head(); + assertEquals(Optional.of(zFrameWithAddedColumn.getAsString(row, newCol)), Optional.of(zFrameWithAddedColumn.getAsString(row, oldCol)), "value of added column is not as expected"); } @@ -352,12 +378,14 @@ public void testGroupByMinMax() throws Exception { ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.ID_COL)); - List assertionRows = (List) groupByDF.collectAsList(); - for (GenericRowWithSchema row : assertionRows) { - if(row.getInt(0)==1) { - Assertions.assertEquals(1001,row.getInt(1)); - Assertions.assertEquals(2002,row.getInt(2)); - } + List assertionRows = groupByDF.collectAsList(); + for (R row : assertionRows) { + if (groupByDF.getAsInt(row, "z_zid") == 1) { + assertEquals(1001, groupByDF.getAsInt(row, "z_minScore"), + "z_minScore is not as expected"); + assertEquals(2002, groupByDF.getAsInt(row, "z_maxScore"), + "z_maxScore is not as expected"); + } } } @@ -368,13 +396,15 @@ public void testGroupByMinMax2() throws Exception { ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.CLUSTER_COLUMN)); - List assertionRows = (List) groupByDF.collectAsList(); - for (GenericRowWithSchema row : assertionRows) { - if(row.getInt(0)==100) { - Assertions.assertEquals(900,row.getInt(1)); - Assertions.assertEquals(9002,row.getInt(2)); - } - } + List assertionRows = groupByDF.collectAsList(); + for (R row : assertionRows) { + if (groupByDF.getAsInt(row, "z_cluster") == 100) { + assertEquals(900, groupByDF.getAsInt(row, "z_minScore"), + "z_minScore is not as expected"); + assertEquals(9002, groupByDF.getAsInt(row, "z_maxScore"), + "z_maxScore is not as expected"); + } + } } @Test @@ -425,28 +455,33 @@ public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exce String[] columnArray = new String[]{"surname", "postcode"}; ZFrame zFrameDeDuplicated = 
zFrame.dropDuplicates(columnArray); - List pojoList = (List) zFrameDeDuplicated.collectAsList(); - - List rowsInZFrameDistinct = new ArrayList<>(); - List rowsInSampleDataDistinct = new ArrayList<>(); - pojoList.forEach(entry -> rowsInZFrameDistinct.add(entry.values())); - for (Schema entry : sampleDataWithDistinctSurnameAndPostCode) { - rowsInSampleDataDistinct.add(PojoToArrayConverter.getObjectArray(entry)); - } - - int matchedRowCount = 0; - for(Object[] rowInZFrameDistinct : rowsInZFrameDistinct) { - for (Object[] rowInSampleDataDistinct : rowsInSampleDataDistinct) { - if (Arrays.equals(Arrays.stream(rowInZFrameDistinct).toArray(), Arrays.stream(rowInSampleDataDistinct).toArray())) { - matchedRowCount++; + List rows = zFrameDeDuplicated.collectAsList(); + + List fields = List.of(Schema.class.getDeclaredFields()); + int matchedCount = 0; + for (Schema schema : sampleDataWithDistinctSurnameAndPostCode) { + for (R row : rows) { + boolean rowMatched = true; + for (Field column : fields) { + String columnName = column.getName(); + if (!column.get(schema).toString(). + equals(zFrame.getAsString(row, columnName))) { + rowMatched = false; + break; + } + } + if (rowMatched) { + matchedCount++; + break; } } } - assertEquals(rowsInSampleDataDistinct.size(), matchedRowCount, - "Rows from zFrame and sample data doesn't match after drop duplicates"); - assertEquals(rowsInZFrameDistinct.size(), matchedRowCount, - "Rows from zFrame and sample data doesn't match after drop duplicates"); + + assertEquals(rows.size(), matchedCount, + "rows count is not as expected"); + assertEquals(sampleDataWithDistinctSurnameAndPostCode.size(), matchedCount, + "rows count is not as expected"); } @Test @@ -456,28 +491,32 @@ public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class); ZFrame zFrameDeDuplicated = zFrame.dropDuplicates("surname", "postcode"); - List pojoList = (List) zFrameDeDuplicated.collectAsList(); - - List rowsInZFrameDistinct = new ArrayList<>(); - List rowsInSampleDataDistinct = new ArrayList<>(); - pojoList.forEach(entry -> rowsInZFrameDistinct.add(entry.values())); - for (Schema entry : sampleDataWithDistinctSurnameAndPostCode) { - rowsInSampleDataDistinct.add(PojoToArrayConverter.getObjectArray(entry)); - } - - int matchedRowCount = 0; - for(Object[] rowInZFrameDistinct : rowsInZFrameDistinct) { - for (Object[] rowInSampleDataDistinct : rowsInSampleDataDistinct) { - if (Arrays.equals(Arrays.stream(rowInZFrameDistinct).toArray(), Arrays.stream(rowInSampleDataDistinct).toArray())) { - matchedRowCount++; + List rows = zFrameDeDuplicated.collectAsList(); + List fields = List.of(Schema.class.getDeclaredFields()); + int matchedCount = 0; + for (Schema schema : sampleDataWithDistinctSurnameAndPostCode) { + for (R row : rows) { + boolean rowMatched = true; + for (Field column : fields) { + String columnName = column.getName(); + if (!column.get(schema).toString(). 
+ equals(zFrame.getAsString(row, columnName))) { + rowMatched = false; + break; + } + } + if (rowMatched) { + matchedCount++; + break; } } } - assertEquals(rowsInSampleDataDistinct.size(), matchedRowCount, - "Rows from zFrame and sample data doesn't match after drop duplicates"); - assertEquals(rowsInZFrameDistinct.size(), matchedRowCount, - "Rows from zFrame and sample data doesn't match after drop duplicates"); + + assertEquals(rows.size(), matchedCount, + "rows count is not as expected"); + assertEquals(sampleDataWithDistinctSurnameAndPostCode.size(), matchedCount, + "rows count is not as expected"); } @Test @@ -488,11 +527,29 @@ public void testSortDescending() throws Exception { String col = STR_RECID; ZFrame zFrameSortedDesc = zFrame.sortDescending(col); - List pojoList = (List) zFrameSortedDesc.collectAsList(); - - for(int idx = 0; idx < sampleData.size(); idx++) { - Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleData.get(idx)), pojoList.get(idx).values(), - "Row from descending sorted sample data is not equal to row from descending sorted zFrame"); + List rows = zFrameSortedDesc.collectAsList(); + + List fields = List.of(SchemaWithMixedDataType.class.getDeclaredFields()); + for (int idx = 0; idx < sampleData.size(); idx++) { + R row = rows.get(idx); + for (Field column : fields) { + String columnName = column.getName(); + if (column.getType() == String.class) { + assertEquals(column.get(sampleData.get(idx)), zFrameSortedDesc.getAsString(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == Integer.class) { + assertEquals(column.get(sampleData.get(idx)), zFrameSortedDesc.getAsInt(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == Double.class) { + assertEquals(column.get(sampleData.get(idx)), zFrameSortedDesc.getAsDouble(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == Long.class) { + assertEquals(column.get(sampleData.get(idx)), zFrameSortedDesc.getAsLong(row, columnName), + "value in ZFrame and sample input is not same"); + } else { + throw new ZinggException("Not a valid data type"); + } + } } } @@ -504,11 +561,29 @@ public void testSortAscending() throws Exception { String col = STR_RECID; ZFrame zFrameSortedAsc = zFrame.sortAscending(col); - List pojoList = (List) zFrameSortedAsc.collectAsList(); - - for(int idx = 0; idx < sampleData.size(); idx++) { - Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleData.get(idx)), pojoList.get(idx).values(), - "Row from ascending sorted sample data is not equal to row from ascending sorted zFrame"); + List rows = zFrameSortedAsc.collectAsList(); + + List fields = List.of(SchemaWithMixedDataType.class.getDeclaredFields()); + for (int idx = 0; idx < sampleData.size(); idx++) { + R row = rows.get(idx); + for (Field column : fields) { + String columnName = column.getName(); + if (column.getType() == String.class) { + assertEquals(column.get(sampleData.get(idx)).toString(), zFrame.getAsString(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == Integer.class) { + assertEquals(column.get(sampleData.get(idx)), zFrame.getAsInt(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == Double.class) { + assertEquals(column.get(sampleData.get(idx)), zFrame.getAsDouble(row, columnName), + "value in ZFrame and sample input is not same"); + } else if (column.getType() == 
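The sort tests above repeat the same if/else dispatch from a POJO field's declared type to the matching ZFrame accessor. One way that dispatch could be factored out is sketched below; the getAsString/getAsInt/getAsDouble/getAsLong calls are the ZFrame methods the patch itself uses, while the helper class and its name are hypothetical and not part of the patch:

    import java.lang.reflect.Field;
    import zingg.common.client.ZFrame;

    // Hypothetical helper: read a row value using the accessor implied by the
    // POJO field's declared type, mirroring the inline chains in the sort tests.
    final class RowFieldReader<D, R, C> {
        Object read(ZFrame<D, R, C> frame, R row, Field column) {
            Class<?> type = column.getType();
            String name = column.getName();
            if (type == String.class) return frame.getAsString(row, name);
            if (type == Integer.class) return frame.getAsInt(row, name);
            if (type == Double.class) return frame.getAsDouble(row, name);
            if (type == Long.class) return frame.getAsLong(row, name);
            throw new IllegalArgumentException("Not a valid data type: " + type);
        }
    }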
@@ -504,11 +561,29 @@ public void testSortAscending() throws Exception {
         String col = STR_RECID;
         ZFrame<D, R, C> zFrameSortedAsc = zFrame.sortAscending(col);

-        List pojoList = (List) zFrameSortedAsc.collectAsList();
-
-        for(int idx = 0; idx < sampleData.size(); idx++) {
-            Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleData.get(idx)), pojoList.get(idx).values(),
-                    "Row from ascending sorted sample data is not equal to row from ascending sorted zFrame");
+        List<R> rows = zFrameSortedAsc.collectAsList();
+
+        List<Field> fields = List.of(SchemaWithMixedDataType.class.getDeclaredFields());
+        for (int idx = 0; idx < sampleData.size(); idx++) {
+            R row = rows.get(idx);
+            for (Field column : fields) {
+                String columnName = column.getName();
+                if (column.getType() == String.class) {
+                    assertEquals(column.get(sampleData.get(idx)).toString(), zFrame.getAsString(row, columnName),
+                            "value in ZFrame and sample input is not the same");
+                } else if (column.getType() == Integer.class) {
+                    assertEquals(column.get(sampleData.get(idx)), zFrame.getAsInt(row, columnName),
+                            "value in ZFrame and sample input is not the same");
+                } else if (column.getType() == Double.class) {
+                    assertEquals(column.get(sampleData.get(idx)), zFrame.getAsDouble(row, columnName),
+                            "value in ZFrame and sample input is not the same");
+                } else if (column.getType() == Long.class) {
+                    assertEquals(column.get(sampleData.get(idx)), zFrame.getAsLong(row, columnName),
+                            "value in ZFrame and sample input is not the same");
+                } else {
+                    throw new ZinggException("Not a valid data type");
+                }
+            }
         }
     }
@@ -526,11 +601,16 @@ public void testDistinct() throws Exception {
         List sampleDataDistinct = createSampleDataListDistinct();
         ZFrame<D, R, C> zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Schema.class);

-        List pojoList = (List) zFrame.distinct().collectAsList();
+        List<R> rows = zFrame.distinct().collectAsList();

-        for(int idx = 0; idx < sampleDataDistinct.size(); idx++) {
-            Assertions.assertArrayEquals(PojoToArrayConverter.getObjectArray(sampleDataDistinct.get(idx)), pojoList.get(idx).values(),
-                    "Row from sample data is not equal to row from zFrame");
+        List<Field> fields = List.of(Schema.class.getDeclaredFields());
+        for (int idx = 0; idx < sampleDataDistinct.size(); idx++) {
+            R row = rows.get(idx);
+            for (Field column : fields) {
+                String columnName = column.getName();
+                assertEquals(column.get(sampleDataDistinct.get(idx)).toString(), zFrame.getAsString(row, columnName),
+                        "value in ZFrame and sample input is not the same");
+            }
         }
     }

From 62317d620c535c5754f63da2e1b1e1ba64c391b9 Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 12:38:10 +0530
Subject: [PATCH 155/219] moved schema classes to common

---
 common/client/pom.xml                          |  16 ++
 .../zingg/common/client/data/Constant.java     | 149 +++++++++++
 .../zingg/common/client/schema/Schema.java     |  17 ++
 .../common/client/schema/SchemaCluster.java    |  15 ++
 .../client/schema/SchemaClusterNull.java       |  15 ++
 .../common/client/schema/SchemaInput.java      |  13 +
 .../schema/SchemaWithMixedDataType.java        |  17 ++
 .../common/client/schema/SchemaZScore.java     |  13 +
 .../client/util/TestStringRedactor.java        |   6 -
 spark/client/pom.xml                           |   8 +
 .../java/zingg/client/TestSparkFrame.java      | 182 ++++++------
 .../java/zingg/client/TestZFrameBase.java      | 239 ++----------------
 12 files changed, 373 insertions(+), 317 deletions(-)
 create mode 100644 common/client/src/test/java/zingg/common/client/data/Constant.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/Schema.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaInput.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java
 create mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java

diff --git a/common/client/pom.xml b/common/client/pom.xml
index c67339949..8e55122e5 100644
--- a/common/client/pom.xml
+++ b/common/client/pom.xml
@@ -14,4 +14,20 @@
 			1.4
 		
 	
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-jar-plugin</artifactId>
+				<version>${maven-jar-plugin.version}</version>
+				<executions>
+					<execution>
+						<goals>
+							<goal>test-jar</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+		</plugins>
+	</build>
 
diff --git a/common/client/src/test/java/zingg/common/client/data/Constant.java b/common/client/src/test/java/zingg/common/client/data/Constant.java
new file mode 100644
index 000000000..afe9b113c
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/data/Constant.java
@@ -0,0 +1,149 @@
+package zingg.common.client.data;
+
+
+import zingg.common.client.schema.Schema;
+import zingg.common.client.schema.SchemaCluster;
+import zingg.common.client.schema.SchemaClusterNull;
+import zingg.common.client.schema.SchemaInput;
+import zingg.common.client.schema.SchemaWithMixedDataType;
+import zingg.common.client.schema.SchemaZScore;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Constant {
+
+    //sample data classes to be used for testing
+    public static List<Schema> createEmptySampleData() {
+
+        return new ArrayList<>();
+    }
+
+    public static List<Schema> createSampleDataList() {
+        List<Schema> sample = new ArrayList<Schema>();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
+        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
+        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
+        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
+
+        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
+        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
+        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
+        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
+        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
+
+        return sample;
+    }
+
+    public static List<Schema> createSampleDataListDistinct() {
+        List<Schema> sample = new ArrayList<Schema>();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
+        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
+        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
+        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
+
+        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
+        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
+        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
+        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
+        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
+
+        return sample;
+    }
+
+    public static List<Schema> createSampleDataListWithDistinctSurnameAndPostcode() {
+        List<Schema> sample = new ArrayList<Schema>();
+        sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g"));
+        sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792"));
+        sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717"));
+        sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858"));
+        sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7"));
+
+        sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g"));
+        sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"));
+        sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405"));
+        sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406"));
+        sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4"));
+
+        return sample;
+    }
+
+    public static List<SchemaWithMixedDataType> createSampleDataListWithMixedDataType() {
+        List<SchemaWithMixedDataType> sample = new ArrayList<SchemaWithMixedDataType>();
+        sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873));
+        sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792));
+        sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717));
+        sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858));
+        sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747));
+
+        sample.add(new SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780));
+        sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909));
+        sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405));
+        sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406));
+        sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734));
+
+        return sample;
+    }
+
+    public static List<SchemaZScore> createSampleDataZScore() {
+
+        List<SchemaZScore> sample = new ArrayList<>();
+        sample.add(new SchemaZScore(0, 100, 900));
+        sample.add(new SchemaZScore(1, 100, 1001));
+        sample.add(new SchemaZScore(1, 100, 1002));
+        sample.add(new SchemaZScore(1, 100, 2001));
+        sample.add(new SchemaZScore(1, 100, 2002));
+        sample.add(new SchemaZScore(11, 100, 9002));
+        sample.add(new SchemaZScore(3, 300, 3001));
+        sample.add(new SchemaZScore(3, 300, 3002));
+        sample.add(new SchemaZScore(3, 400, 4001));
+        sample.add(new SchemaZScore(4, 400, 4002));
+
+        return sample;
+    }
+
+    public static List<SchemaCluster> createSampleDataCluster() {
+
+        List<SchemaCluster> sample = new ArrayList<>();
+        sample.add(new SchemaCluster(1, 100, 1001, "b"));
+        sample.add(new SchemaCluster(2, 100, 1002, "a"));
+        sample.add(new SchemaCluster(3, 100, 2001, "b"));
+        sample.add(new SchemaCluster(4, 900, 2002, "c"));
+        sample.add(new SchemaCluster(5, 111, 9002, "c"));
+
+        return sample;
+    }
+
+    public static List<SchemaClusterNull> createSampleDataClusterWithNull() {
+
+        List<SchemaClusterNull> sample = new ArrayList<>();
+        sample.add(new SchemaClusterNull(1, 100, 1001, "b"));
+        sample.add(new SchemaClusterNull(2, 100, 1002, "a"));
+        sample.add(new SchemaClusterNull(3, 100, 2001, null));
+        sample.add(new SchemaClusterNull(4, 900, 2002, "c"));
+        sample.add(new SchemaClusterNull(5, 111, 9002, null));
+
+        return sample;
+    }
+
+    public static List<SchemaInput> createSampleDataInput() {
+
+        List<SchemaInput> sample = new ArrayList<>();
+        sample.add(new SchemaInput(1, "fname1", "b"));
+        sample.add(new SchemaInput(2, "fname", "a"));
+        sample.add(new SchemaInput(3, "fna", "b"));
+        sample.add((new SchemaInput(4, "x", "c")));
+        sample.add(new SchemaInput(5, "y", "c"));
+        sample.add(new SchemaInput(11, "new1", "b"));
+        sample.add(new SchemaInput(22, "new12", "a"));
+        sample.add(new SchemaInput(33, "new13", "b"));
+        sample.add(new SchemaInput(44, "new14", "c"));
+        sample.add(new SchemaInput(55, "new15", "c"));
+
+        return sample;
+    }
+
+}
diff --git a/common/client/src/test/java/zingg/common/client/schema/Schema.java b/common/client/src/test/java/zingg/common/client/schema/Schema.java
new file mode 100644
index 000000000..95c7a78e1
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/Schema.java
@@ -0,0 +1,17 @@
+package zingg.common.client.schema;
+
+public class Schema {
+    public final String recid;
+    public final String givenname;
+    public final String surname;
+    public final String suburb;
+    public final String postcode;
+
+    public Schema(String recid, String givenname, String surname, String suburb, String postcode) {
+        this.recid = recid;
+        this.givenname = givenname;
+        this.surname = surname;
+        this.suburb = suburb;
+        this.postcode = postcode;
+    }
+}
diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java b/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java
new file mode 100644
index 000000000..f720118ba
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java
@@ -0,0 +1,15 @@
+package zingg.common.client.schema;
+
+public class SchemaCluster {
+    public final Integer z_zid;
+    public final Integer z_cluster;
+    public final Integer z_score;
+    public final String z_zsource;
+
+    public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, String z_zsource) {
+        this.z_zid = z_zid;
+        this.z_cluster = z_cluster;
+        this.z_score = z_score;
+        this.z_zsource = z_zsource;
+    }
+}
diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java b/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java
new file mode 100644
index 000000000..0f8847498
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java
@@ -0,0 +1,15 @@
+package zingg.common.client.schema;
+
+public class SchemaClusterNull {
+    public final Integer z_z_zid;
+    public final Integer z_cluster;
+    public final Integer z_score;
+    public final String z_zsource;
+
+    public SchemaClusterNull(Integer z_z_zid, Integer z_cluster, Integer z_score, String z_zsource) {
+        this.z_z_zid = z_z_zid;
+        this.z_cluster = z_cluster;
+        this.z_score = z_score;
+        this.z_zsource = z_zsource;
+    }
+}
\ No newline at end of file
diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaInput.java b/common/client/src/test/java/zingg/common/client/schema/SchemaInput.java
new file mode 100644
index 000000000..c1569cd9b
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/SchemaInput.java
@@ -0,0 +1,13 @@
+package zingg.common.client.schema;
+
+public class SchemaInput {
+    public final Integer z_zid;
+    public final String fname;
+    public final String z_zsource;
+
+    public SchemaInput(Integer z_zid, String fname, String z_zsource) {
+        this.z_zid = z_zid;
+        this.fname = fname;
+        this.z_zsource = z_zsource;
+    }
+}
\ No newline at end of file
diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java b/common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java
new file mode 100644
index 000000000..7878df8c2
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java
@@ -0,0 +1,17 @@
+package zingg.common.client.schema;
+
+public class SchemaWithMixedDataType {
+    public final Integer recid;
+    public final String givenname;
+    public final String surname;
+    public final Double cost;
+    public final Integer postcode;
+
+    public SchemaWithMixedDataType(Integer recid, String givenname, String surname, Double cost, Integer postcode) {
+        this.recid = recid;
+        this.givenname = givenname;
+        this.surname = surname;
+        this.cost = cost;
+        this.postcode = postcode;
+    }
+}
\ No newline at end of file
diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java b/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java
new file mode 100644
index 000000000..83a443efa
--- /dev/null
+++ b/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java
@@ -0,0 +1,13 @@
+package zingg.common.client.schema;
+
+public class SchemaZScore {
+    public final Integer z_zid;
+    public final Integer z_cluster;
+    public final Integer z_score;
+
+    public SchemaZScore(Integer z_zid, Integer z_cluster, Integer z_score) {
+        this.z_zid = z_zid;
+        this.z_cluster = z_cluster;
+        this.z_score = z_score;
+    }
+}
diff --git a/common/client/src/test/java/zingg/common/client/util/TestStringRedactor.java b/common/client/src/test/java/zingg/common/client/util/TestStringRedactor.java
index 10220a5f1..07aff4f66 100644
--- a/common/client/src/test/java/zingg/common/client/util/TestStringRedactor.java
+++ b/common/client/src/test/java/zingg/common/client/util/TestStringRedactor.java
@@ -1,7 +1,5 @@
 package zingg.common.client.util;

-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.fail;
 import static org.junit.jupiter.api.Assertions.assertTrue;

 import java.util.stream.Stream;
@@ -10,14 +8,10 @@
 import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.Arguments;
 import static org.junit.jupiter.params.provider.Arguments.arguments;
-import static org.junit.jupiter.api.Named.named;

-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;

-import org.junit.jupiter.api.Test;
-
 public class TestStringRedactor {

 @ParameterizedTest(name="{0}")
diff --git a/spark/client/pom.xml b/spark/client/pom.xml
index 28be7db77..a42974de4 100644
--- a/spark/client/pom.xml
+++ b/spark/client/pom.xml
@@ -39,6 +39,14 @@
 			jackson-annotations
 			${fasterxml.jackson.version}
 		
+		<dependency>
+			<groupId>zingg</groupId>
+			<artifactId>zingg-common-client</artifactId>
+			<version>${zingg.version}</version>
+			<classifier>tests</classifier>
+			<type>test-jar</type>
+			<scope>test</scope>
+		</dependency>
 	
diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
index 9446d5e93..e82acb5fb 100644
--- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java
+++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
@@ -1,103 +1,111 @@
 package zingg.client;

-import java.util.Arrays;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.*;
-import org.apache.spark.sql.types.*;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
-
-import zingg.common.client.Arguments;
 import zingg.common.client.IArguments;
+import zingg.common.client.ZFrame;
 import zingg.spark.client.SparkFrame;
 import zingg.spark.client.util.SparkDFObjectUtil;

+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
 public class TestSparkFrame extends TestZFrameBase<SparkSession, Dataset<Row>, Row, Column, DataType> {
-    public static final Log LOG = LogFactory.getLog(TestSparkFrame.class);
-    public static IArguments args;
-    public static JavaSparkContext ctx;
-    public static SparkSession spark;
-
-    @BeforeAll
-    public static void setup() {
-        setUpSpark();
-    }
-
-    protected static void setUpSpark() {
-        try {
-            spark = SparkSession
-                    .builder()
-                    .master("local[*]")
-                    .appName("Zingg" + "Junit")
-                    .getOrCreate();
-            ctx = new JavaSparkContext(spark.sparkContext());
-            JavaSparkContext.jarOfClass(TestZFrameBase.class);
-            args = new Arguments();
-        } catch (Throwable e) {
-            if (LOG.isDebugEnabled())
-                e.printStackTrace();
-            LOG.info("Problem in spark env setup");
-        }
-    }
-
-    @AfterAll
-    public static void teardown() {
-        if (ctx != null) {
-            ctx.stop();
-            ctx = null;
-        }
-        if (spark != null) {
-            spark.stop();
-            spark = null;
-        }
-    }
-
-    private SparkSession sparkSession;
-
-    public TestSparkFrame() {
-        super(new SparkDFObjectUtil(spark));
-    }
-
-    @Test
-    public void testAliasOfSparkFrame() {
-        SparkFrame sf = new SparkFrame(createSampleDataset());
-        String aliasName = "AnotherName";
-        sf.as(aliasName);
-        assertTrueCheckingExceptOutput(sf.as(aliasName), sf, "Dataframe and its alias are not same");
-    }
-
-    public Dataset<Row> createSampleDataset() {
-
-        if (spark==null) {
-            setUpSpark();
-        }
-
-        StructType schemaOfSample = new StructType(new StructField[] {
-                new StructField("recid", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("givenname", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("surname", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("suburb", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("postcode", DataTypes.StringType, false, Metadata.empty())
-        });
-
-        Dataset<Row> sample = spark.createDataFrame(Arrays.asList(
-                RowFactory.create("07317257", "erjc", "henson", "hendersonville", "2873g"),
-                RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"),
-                RowFactory.create("02890805", "david", "pisczek", "durham", "27717"),
-                RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"),
-                RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"),
-                RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"),
-                RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"),
-                RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"),
-                RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"),
-                RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample);
-
-        return sample;
-    }
+    public static final Log LOG = LogFactory.getLog(TestSparkFrame.class);
+    public static IArguments args;
+    public static JavaSparkContext ctx;
+    public static SparkSession spark;
+    private SparkSession sparkSession;
+
+    public TestSparkFrame() {
+        super(new SparkDFObjectUtil(spark));
+    }
+
+    @BeforeAll
+    public static void setup() {
+        setUpSpark();
+    }
+
+    protected static void setUpSpark() {
+        try {
+            spark = SparkSession
+                    .builder()
+                    .master("local[*]")
+                    .appName("Zingg" + "Junit")
+                    .getOrCreate();
+            ctx = new JavaSparkContext(spark.sparkContext());
+        } catch (Throwable e) {
+            if (LOG.isDebugEnabled())
+                e.printStackTrace();
+            LOG.info("Problem in spark env setup");
+        }
+    }
+
+    @AfterAll
+    public static void teardown() {
+        if (ctx != null) {
+            ctx.stop();
+            ctx = null;
+        }
+        if (spark != null) {
+            spark.stop();
+            spark = null;
+        }
+    }
+
+    @Test
+    public void testAliasOfSparkFrame() {
+        SparkFrame sf = new SparkFrame(createSampleDataset());
+        String aliasName = "AnotherName";
+        sf.as(aliasName);
+        assertTrueCheckingExceptOutput(sf.as(aliasName), sf, "Dataframe and its alias are not same");
+    }
+
+    public Dataset<Row> createSampleDataset() {
+
+        if (spark == null) {
+            setUpSpark();
+        }
+
+        StructType schemaOfSample = new StructType(new StructField[]{
+                new StructField("recid", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("givenname", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("surname", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("suburb", DataTypes.StringType, false, Metadata.empty()),
+                new StructField("postcode", DataTypes.StringType, false, Metadata.empty())
+        });
+
+        return spark.createDataFrame(Arrays.asList(
+                RowFactory.create("07317257", "erjc", "henson", "hendersonville", "2873g"),
+                RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"),
+                RowFactory.create("02890805", "david", "pisczek", "durham", "27717"),
+                RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"),
+                RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"),
+                RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"),
+                RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"),
+                RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"),
+                RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"),
+                RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample);
+    }
+
+    protected void assertTrueCheckingExceptOutput(ZFrame<Dataset<Row>, Row, Column> sf1, ZFrame<Dataset<Row>, Row, Column> sf2, String message) {
+        assertTrue(sf1.except(sf2).isEmpty(), message);
+    }
 }
\ No newline at end of file
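The rewritten TestSparkFrame shows the intended pattern: an engine module supplies only a DFObjectUtil plus its concrete frame, row and column types, and inherits the full suite from TestZFrameBase. A hypothetical second engine would plug in the same way; InMemorySession, LocalFrame and LocalDFObjectUtil below are made-up placeholder names, not types from the patch:

    // Sketch only: every type argument here is a stand-in for a real engine's types.
    public class TestLocalFrame extends TestZFrameBase<InMemorySession, LocalFrame, Object[], String, Class<?>> {
        public TestLocalFrame() {
            // the engine-specific DFObjectUtil is all the base suite needs
            super(new LocalDFObjectUtil(new InMemorySession()));
        }
    }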
"hendersonville", "2873g"), + RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"), + RowFactory.create("02890805", "david", "pisczek", "durham", "27717"), + RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"), + RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"), + RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"), + RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"), + RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"), + RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"), + RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample); + } + + protected void assertTrueCheckingExceptOutput(ZFrame, Row, Column> sf1, ZFrame, Row, Column> sf2, String message) { + assertTrue(sf1.except(sf2).isEmpty(), message); + } } \ No newline at end of file diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java index c0d24877b..dcb59dac2 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java @@ -9,6 +9,12 @@ import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.core.ZinggException; +import zingg.common.client.schema.Schema; +import zingg.common.client.schema.SchemaWithMixedDataType; +import zingg.common.client.schema.SchemaZScore; +import zingg.common.client.schema.SchemaInput; +import zingg.common.client.schema.SchemaCluster; +import zingg.common.client.schema.SchemaClusterNull; import java.lang.reflect.Field; import java.util.ArrayList; @@ -18,6 +24,15 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; +import static zingg.common.client.data.Constant.createEmptySampleData; +import static zingg.common.client.data.Constant.createSampleDataCluster; +import static zingg.common.client.data.Constant.createSampleDataClusterWithNull; +import static zingg.common.client.data.Constant.createSampleDataInput; +import static zingg.common.client.data.Constant.createSampleDataList; +import static zingg.common.client.data.Constant.createSampleDataListDistinct; +import static zingg.common.client.data.Constant.createSampleDataListWithDistinctSurnameAndPostcode; +import static zingg.common.client.data.Constant.createSampleDataListWithMixedDataType; +import static zingg.common.client.data.Constant.createSampleDataZScore; public abstract class TestZFrameBase { @@ -613,228 +628,4 @@ public void testDistinct() throws Exception { } } } - - //sample data to be used for testing - - public static List createEmptySampleData() { - - return new ArrayList<>(); - } - - public static List createSampleDataList() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); - - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", 
"elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); - - return sample; - } - - public static List createSampleDataListDistinct() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); - - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); - - return sample; - } - - public static List createSampleDataListWithDistinctSurnameAndPostcode() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); - - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); - - return sample; - } - - public static List createSampleDataListWithMixedDataType() { - List sample = new ArrayList(); - sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873)); - sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792)); - sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717)); - sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858)); - sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747)); - - sample.add(new SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780)); - sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909)); - sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405)); - sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406)); - sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734)); - - return sample; - } - - public static List createSampleDataZScore() { - - List sample = new ArrayList<>(); - sample.add(new SchemaZScore(0, 100, 900)); - sample.add(new SchemaZScore(1, 100, 1001)); - sample.add(new SchemaZScore(1, 100, 1002)); - sample.add(new SchemaZScore(1, 100, 2001)); - sample.add(new SchemaZScore(1, 100, 2002)); - sample.add(new SchemaZScore(11, 100, 9002)); - sample.add(new 
SchemaZScore(3, 300, 3001)); - sample.add(new SchemaZScore(3, 300, 3002)); - sample.add(new SchemaZScore(3, 400, 4001)); - sample.add(new SchemaZScore(4, 400, 4002)); - - return sample; - } - - public static List createSampleDataCluster() { - - List sample = new ArrayList<>(); - sample.add(new SchemaCluster(1, 100, 1001, "b")); - sample.add(new SchemaCluster(2, 100, 1002, "a")); - sample.add(new SchemaCluster(3, 100, 2001, "b")); - sample.add(new SchemaCluster(4, 900, 2002, "c")); - sample.add(new SchemaCluster(5, 111, 9002, "c")); - - return sample; - } - - public static List createSampleDataClusterWithNull() { - - List sample = new ArrayList<>(); - sample.add(new SchemaClusterNull(1, 100, 1001, "b")); - sample.add(new SchemaClusterNull(2, 100, 1002, "a")); - sample.add(new SchemaClusterNull(3, 100, 2001, null)); - sample.add(new SchemaClusterNull(4, 900, 2002, "c")); - sample.add(new SchemaClusterNull(5, 111, 9002, null)); - - return sample; - } - - public static List createSampleDataInput() { - - List sample = new ArrayList<>(); - sample.add(new SchemaInput(1, "fname1", "b")); - sample.add(new SchemaInput(2, "fname", "a")); - sample.add(new SchemaInput(3, "fna", "b")); - sample.add((new SchemaInput(4, "x", "c"))); - sample.add(new SchemaInput(5, "y", "c")); - sample.add(new SchemaInput(11, "new1", "b")); - sample.add(new SchemaInput(22, "new12", "a")); - sample.add(new SchemaInput(33, "new13", "b")); - sample.add(new SchemaInput(44, "new14", "c")); - sample.add(new SchemaInput(55, "new15", "c")); - - return sample; - } - - protected void assertTrueCheckingExceptOutput(ZFrame sf1, ZFrame sf2, String message) { - assertTrue(sf1.except(sf2).isEmpty(), message); - } - - //POJO classes for defining schema - - public static class Schema { - public final String recid; - public final String givenname; - public final String surname; - public final String suburb; - public final String postcode; - - public Schema(String recid, String givename, String surname, String suburb, String postcode) { - this.recid = recid; - this.givenname = givename; - this.surname = surname; - this.suburb = suburb; - this.postcode = postcode; - } - } - - public static class SchemaWithMixedDataType { - public final Integer recid; - public final String givenname; - public final String surname; - public final Double cost; - public final Integer postcode; - - public SchemaWithMixedDataType(Integer recid, String givename, String surname, Double cost, Integer postcode) { - this.recid = recid; - this.givenname = givename; - this.surname = surname; - this.cost = cost; - this.postcode = postcode; - } - } - - public static class SchemaZScore { - public final Integer z_zid; - public final Integer z_cluster; - public final Integer z_score; - - public SchemaZScore(Integer z_zid, Integer z_cluster, Integer z_score) { - this.z_zid = z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - } - } - - public static class SchemaClusterNull { - public final Integer z_z_zid; - public final Integer z_cluster; - public final Integer z_score; - public final String z_zsource; - - public SchemaClusterNull(Integer z_z_zid, Integer z_cluster, Integer z_score, String z_zsource) { - this.z_z_zid = z_z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - this.z_zsource = z_zsource; - } - } - - public static class SchemaCluster { - public final Integer z_zid; - public final Integer z_cluster; - public final Integer z_score; - public final String z_zsource; - - public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, 
String z_zsource) { - this.z_zid = z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - this.z_zsource = z_zsource; - } - } - - public static class SchemaInput { - public final Integer z_zid; - public final String fname; - public final String z_zsource; - - public SchemaInput(Integer z_zid, String fname, String z_zsource) { - this.z_zid = z_zid; - this.fname = fname; - this.z_zsource = z_zsource; - } - } } \ No newline at end of file From 50800c59babdb624b64de372de6a2b5f24891575 Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 8 Jul 2024 13:43:19 +0530 Subject: [PATCH 156/219] renamed model classes --- .../zingg/common/client/data/Constant.java | 192 ++++++++--------- .../common/client/model/ClusterPairOne.java | 15 ++ .../common/client/model/ClusterPairTwo.java | 15 ++ .../ClusterSource.java} | 8 +- .../common/client/model/ClusterZScore.java | 13 ++ .../{schema/Schema.java => model/Person.java} | 6 +- .../PersonMixed.java} | 6 +- .../common/client/schema/SchemaCluster.java | 15 -- .../client/schema/SchemaClusterNull.java | 15 -- .../common/client/schema/SchemaZScore.java | 13 -- .../java/zingg/client/TestZFrameBase.java | 200 +++++++++--------- 11 files changed, 249 insertions(+), 249 deletions(-) create mode 100644 common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java create mode 100644 common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java rename common/client/src/test/java/zingg/common/client/{schema/SchemaInput.java => model/ClusterSource.java} (50%) create mode 100644 common/client/src/test/java/zingg/common/client/model/ClusterZScore.java rename common/client/src/test/java/zingg/common/client/{schema/Schema.java => model/Person.java} (75%) rename common/client/src/test/java/zingg/common/client/{schema/SchemaWithMixedDataType.java => model/PersonMixed.java} (63%) delete mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java delete mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java delete mode 100644 common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java diff --git a/common/client/src/test/java/zingg/common/client/data/Constant.java b/common/client/src/test/java/zingg/common/client/data/Constant.java index afe9b113c..e84f6c0fb 100644 --- a/common/client/src/test/java/zingg/common/client/data/Constant.java +++ b/common/client/src/test/java/zingg/common/client/data/Constant.java @@ -1,12 +1,12 @@ package zingg.common.client.data; -import zingg.common.client.schema.Schema; -import zingg.common.client.schema.SchemaCluster; -import zingg.common.client.schema.SchemaClusterNull; -import zingg.common.client.schema.SchemaInput; -import zingg.common.client.schema.SchemaWithMixedDataType; -import zingg.common.client.schema.SchemaZScore; +import zingg.common.client.model.Person; +import zingg.common.client.model.ClusterPairOne; +import zingg.common.client.model.ClusterPairTwo; +import zingg.common.client.model.ClusterSource; +import zingg.common.client.model.PersonMixed; +import zingg.common.client.model.ClusterZScore; import java.util.ArrayList; import java.util.List; @@ -14,134 +14,134 @@ public class Constant { //sample data classes to be used for testing - public static List createEmptySampleData() { + public static List createEmptySampleData() { return new ArrayList<>(); } - public static List createSampleDataList() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new 
Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); - - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); + public static List createSampleDataList() { + List sample = new ArrayList(); + sample.add(new Person("07317257", "erjc", "henson", "hendersonville", "2873g")); + sample.add(new Person("07317257", "erjc", "henson", "hendersonville", "2873g")); + sample.add(new Person("03102490", "jhon", "kozak", "henders0nville", "28792")); + sample.add(new Person("02890805", "david", "pisczek", "durham", "27717")); + sample.add(new Person("04437063", "e5in", "bbrown", "greenville", "27858")); + sample.add(new Person("03211564", "susan", "jones", "greenjboro", "274o7")); + + sample.add(new Person("04155808", "jerome", "wilkins", "battleborn", "2780g")); + sample.add(new Person("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); + sample.add(new Person("06087743", "william", "craven", "greenshoro", "27405")); + sample.add(new Person("00538491", "marh", "jackdon", "greensboro", "27406")); + sample.add(new Person("01306702", "vonnell", "palmer", "siler sity", "273q4")); return sample; } - public static List createSampleDataListDistinct() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); + public static List createSampleDataListDistinct() { + List sample = new ArrayList(); + sample.add(new Person("07317257", "erjc", "henson", "hendersonville", "2873g")); + sample.add(new Person("03102490", "jhon", "kozak", "henders0nville", "28792")); + sample.add(new Person("02890805", "david", "pisczek", "durham", "27717")); + sample.add(new Person("04437063", "e5in", "bbrown", "greenville", "27858")); + sample.add(new Person("03211564", "susan", "jones", "greenjboro", "274o7")); - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); + sample.add(new Person("04155808", "jerome", "wilkins", "battleborn", "2780g")); + sample.add(new Person("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); + sample.add(new Person("06087743", "william", "craven", "greenshoro", "27405")); + sample.add(new Person("00538491", "marh", "jackdon", "greensboro", "27406")); + sample.add(new Person("01306702", 
"vonnell", "palmer", "siler sity", "273q4")); return sample; } - public static List createSampleDataListWithDistinctSurnameAndPostcode() { - List sample = new ArrayList(); - sample.add(new Schema("07317257", "erjc", "henson", "hendersonville", "2873g")); - sample.add(new Schema("03102490", "jhon", "kozak", "henders0nville", "28792")); - sample.add(new Schema("02890805", "david", "pisczek", "durham", "27717")); - sample.add(new Schema("04437063", "e5in", "bbrown", "greenville", "27858")); - sample.add(new Schema("03211564", "susan", "jones", "greenjboro", "274o7")); + public static List createSampleDataListWithDistinctSurnameAndPostcode() { + List sample = new ArrayList(); + sample.add(new Person("07317257", "erjc", "henson", "hendersonville", "2873g")); + sample.add(new Person("03102490", "jhon", "kozak", "henders0nville", "28792")); + sample.add(new Person("02890805", "david", "pisczek", "durham", "27717")); + sample.add(new Person("04437063", "e5in", "bbrown", "greenville", "27858")); + sample.add(new Person("03211564", "susan", "jones", "greenjboro", "274o7")); - sample.add(new Schema("04155808", "jerome", "wilkins", "battleborn", "2780g")); - sample.add(new Schema("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); - sample.add(new Schema("06087743", "william", "craven", "greenshoro", "27405")); - sample.add(new Schema("00538491", "marh", "jackdon", "greensboro", "27406")); - sample.add(new Schema("01306702", "vonnell", "palmer", "siler sity", "273q4")); + sample.add(new Person("04155808", "jerome", "wilkins", "battleborn", "2780g")); + sample.add(new Person("05723231", "clarinw", "pastoreus", "elizabeth city", "27909")); + sample.add(new Person("06087743", "william", "craven", "greenshoro", "27405")); + sample.add(new Person("00538491", "marh", "jackdon", "greensboro", "27406")); + sample.add(new Person("01306702", "vonnell", "palmer", "siler sity", "273q4")); return sample; } - public static List createSampleDataListWithMixedDataType() { - List sample = new ArrayList(); - sample.add(new SchemaWithMixedDataType(7317257, "erjc", "henson", 10.021, 2873)); - sample.add(new SchemaWithMixedDataType(3102490, "jhon", "kozak", 3.2434, 28792)); - sample.add(new SchemaWithMixedDataType(2890805, "david", "pisczek", 5436.0232, 27717)); - sample.add(new SchemaWithMixedDataType(4437063, "e5in", "bbrown", 67.0, 27858)); - sample.add(new SchemaWithMixedDataType(3211564, "susan", "jones", 7343.2324, 2747)); + public static List createSampleDataListWithMixedDataType() { + List sample = new ArrayList(); + sample.add(new PersonMixed(7317257, "erjc", "henson", 10.021, 2873)); + sample.add(new PersonMixed(3102490, "jhon", "kozak", 3.2434, 28792)); + sample.add(new PersonMixed(2890805, "david", "pisczek", 5436.0232, 27717)); + sample.add(new PersonMixed(4437063, "e5in", "bbrown", 67.0, 27858)); + sample.add(new PersonMixed(3211564, "susan", "jones", 7343.2324, 2747)); - sample.add(new SchemaWithMixedDataType(4155808, "jerome", "wilkins", 50.34, 2780)); - sample.add(new SchemaWithMixedDataType(5723231, "clarinw", "pastoreus", 87.2323, 27909)); - sample.add(new SchemaWithMixedDataType(6087743, "william", "craven", 834.123, 27405)); - sample.add(new SchemaWithMixedDataType(538491, "marh", "jackdon", 123.123, 27406)); - sample.add(new SchemaWithMixedDataType(1306702, "vonnell", "palmer", 83.123, 2734)); + sample.add(new PersonMixed(4155808, "jerome", "wilkins", 50.34, 2780)); + sample.add(new PersonMixed(5723231, "clarinw", "pastoreus", 87.2323, 27909)); + sample.add(new PersonMixed(6087743, 
"william", "craven", 834.123, 27405)); + sample.add(new PersonMixed(538491, "marh", "jackdon", 123.123, 27406)); + sample.add(new PersonMixed(1306702, "vonnell", "palmer", 83.123, 2734)); return sample; } - public static List createSampleDataZScore() { + public static List createSampleDataZScore() { - List sample = new ArrayList<>(); - sample.add(new SchemaZScore(0, 100, 900)); - sample.add(new SchemaZScore(1, 100, 1001)); - sample.add(new SchemaZScore(1, 100, 1002)); - sample.add(new SchemaZScore(1, 100, 2001)); - sample.add(new SchemaZScore(1, 100, 2002)); - sample.add(new SchemaZScore(11, 100, 9002)); - sample.add(new SchemaZScore(3, 300, 3001)); - sample.add(new SchemaZScore(3, 300, 3002)); - sample.add(new SchemaZScore(3, 400, 4001)); - sample.add(new SchemaZScore(4, 400, 4002)); + List sample = new ArrayList<>(); + sample.add(new ClusterZScore(0L, "100", 900.0)); + sample.add(new ClusterZScore(1L, "100", 1001.0)); + sample.add(new ClusterZScore(1L, "100", 1002.0)); + sample.add(new ClusterZScore(1L, "100", 2001.0)); + sample.add(new ClusterZScore(1L, "100", 2002.0)); + sample.add(new ClusterZScore(11L, "100", 9002.0)); + sample.add(new ClusterZScore(3L, "300", 3001.0)); + sample.add(new ClusterZScore(3L, "300", 3002.0)); + sample.add(new ClusterZScore(3L, "400", 4001.0)); + sample.add(new ClusterZScore(4L, "400", 4002.0)); return sample; } - public static List createSampleDataCluster() { + public static List createSampleDataCluster() { - List sample = new ArrayList<>(); - sample.add(new SchemaCluster(1, 100, 1001, "b")); - sample.add(new SchemaCluster(2, 100, 1002, "a")); - sample.add(new SchemaCluster(3, 100, 2001, "b")); - sample.add(new SchemaCluster(4, 900, 2002, "c")); - sample.add(new SchemaCluster(5, 111, 9002, "c")); + List sample = new ArrayList<>(); + sample.add(new ClusterPairOne(1L, "100", 1001.0, "b")); + sample.add(new ClusterPairOne(2L, "100", 1002.0, "a")); + sample.add(new ClusterPairOne(3L, "100", 2001.0, "b")); + sample.add(new ClusterPairOne(4L, "900", 2002.0, "c")); + sample.add(new ClusterPairOne(5L, "111", 9002.0, "c")); return sample; } - public static List createSampleDataClusterWithNull() { + public static List createSampleDataClusterWithNull() { - List sample = new ArrayList<>(); - sample.add(new SchemaClusterNull(1, 100, 1001, "b")); - sample.add(new SchemaClusterNull(2, 100, 1002, "a")); - sample.add(new SchemaClusterNull(3, 100, 2001, null)); - sample.add(new SchemaClusterNull(4, 900, 2002, "c")); - sample.add(new SchemaClusterNull(5, 111, 9002, null)); + List sample = new ArrayList<>(); + sample.add(new ClusterPairTwo(1L, "100", 1001.0, "b")); + sample.add(new ClusterPairTwo(2L, "100", 1002.0, "a")); + sample.add(new ClusterPairTwo(3L, "100", 2001.0, null)); + sample.add(new ClusterPairTwo(4L, "900", 2002.0, "c")); + sample.add(new ClusterPairTwo(5L, "111", 9002.0, null)); return sample; } - public static List createSampleDataInput() { - - List sample = new ArrayList<>(); - sample.add(new SchemaInput(1, "fname1", "b")); - sample.add(new SchemaInput(2, "fname", "a")); - sample.add(new SchemaInput(3, "fna", "b")); - sample.add((new SchemaInput(4, "x", "c"))); - sample.add(new SchemaInput(5, "y", "c")); - sample.add(new SchemaInput(11, "new1", "b")); - sample.add(new SchemaInput(22, "new12", "a")); - sample.add(new SchemaInput(33, "new13", "b")); - sample.add(new SchemaInput(44, "new14", "c")); - sample.add(new SchemaInput(55, "new15", "c")); + public static List createSampleDataInput() { + + List sample = new ArrayList<>(); + sample.add(new 
ClusterSource(1L, "fname1", "b")); + sample.add(new ClusterSource(2L, "fname", "a")); + sample.add(new ClusterSource(3L, "fna", "b")); + sample.add((new ClusterSource(4L, "x", "c"))); + sample.add(new ClusterSource(5L, "y", "c")); + sample.add(new ClusterSource(11L, "new1", "b")); + sample.add(new ClusterSource(22L, "new12", "a")); + sample.add(new ClusterSource(33L, "new13", "b")); + sample.add(new ClusterSource(44L, "new14", "c")); + sample.add(new ClusterSource(55L, "new15", "c")); return sample; } diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java b/common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java new file mode 100644 index 000000000..6de7c7ff8 --- /dev/null +++ b/common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java @@ -0,0 +1,15 @@ +package zingg.common.client.model; + +public class ClusterPairOne { + public final Long z_zid; + public final String z_cluster; + public final Double z_score; + public final String z_zsource; + + public ClusterPairOne(Long z_zid, String z_cluster, Double z_score, String z_zsource) { + this.z_zid = z_zid; + this.z_cluster = z_cluster; + this.z_score = z_score; + this.z_zsource = z_zsource; + } +} diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java b/common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java new file mode 100644 index 000000000..73935df20 --- /dev/null +++ b/common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java @@ -0,0 +1,15 @@ +package zingg.common.client.model; + +public class ClusterPairTwo { + public final Long z_z_zid; + public final String z_cluster; + public final Double z_score; + public final String z_zsource; + + public ClusterPairTwo(Long z_z_zid, String z_cluster, Double z_score, String z_zsource) { + this.z_z_zid = z_z_zid; + this.z_cluster = z_cluster; + this.z_score = z_score; + this.z_zsource = z_zsource; + } +} \ No newline at end of file diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaInput.java b/common/client/src/test/java/zingg/common/client/model/ClusterSource.java similarity index 50% rename from common/client/src/test/java/zingg/common/client/schema/SchemaInput.java rename to common/client/src/test/java/zingg/common/client/model/ClusterSource.java index c1569cd9b..e21727258 100644 --- a/common/client/src/test/java/zingg/common/client/schema/SchemaInput.java +++ b/common/client/src/test/java/zingg/common/client/model/ClusterSource.java @@ -1,11 +1,11 @@ -package zingg.common.client.schema; +package zingg.common.client.model; -public class SchemaInput { - public final Integer z_zid; +public class ClusterSource { + public final Long z_zid; public final String fname; public final String z_zsource; - public SchemaInput(Integer z_zid, String fname, String z_zsource) { + public ClusterSource(Long z_zid, String fname, String z_zsource) { this.z_zid = z_zid; this.fname = fname; this.z_zsource = z_zsource; diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterZScore.java b/common/client/src/test/java/zingg/common/client/model/ClusterZScore.java new file mode 100644 index 000000000..83785087f --- /dev/null +++ b/common/client/src/test/java/zingg/common/client/model/ClusterZScore.java @@ -0,0 +1,13 @@ +package zingg.common.client.model; + +public class ClusterZScore { + public final Long z_zid; + public final String z_cluster; + public final Double z_score; + + public ClusterZScore(Long z_zid, String z_cluster, Double z_score) { + 
this.z_zid = z_zid; + this.z_cluster = z_cluster; + this.z_score = z_score; + } +} diff --git a/common/client/src/test/java/zingg/common/client/schema/Schema.java b/common/client/src/test/java/zingg/common/client/model/Person.java similarity index 75% rename from common/client/src/test/java/zingg/common/client/schema/Schema.java rename to common/client/src/test/java/zingg/common/client/model/Person.java index 95c7a78e1..8c12519fe 100644 --- a/common/client/src/test/java/zingg/common/client/schema/Schema.java +++ b/common/client/src/test/java/zingg/common/client/model/Person.java @@ -1,13 +1,13 @@ -package zingg.common.client.schema; +package zingg.common.client.model; -public class Schema { +public class Person { public final String recid; public final String givenname; public final String surname; public final String suburb; public final String postcode; - public Schema(String recid, String givename, String surname, String suburb, String postcode) { + public Person(String recid, String givename, String surname, String suburb, String postcode) { this.recid = recid; this.givenname = givename; this.surname = surname; diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java b/common/client/src/test/java/zingg/common/client/model/PersonMixed.java similarity index 63% rename from common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java rename to common/client/src/test/java/zingg/common/client/model/PersonMixed.java index 7878df8c2..d432370c2 100644 --- a/common/client/src/test/java/zingg/common/client/schema/SchemaWithMixedDataType.java +++ b/common/client/src/test/java/zingg/common/client/model/PersonMixed.java @@ -1,13 +1,13 @@ -package zingg.common.client.schema; +package zingg.common.client.model; -public class SchemaWithMixedDataType { +public class PersonMixed { public final Integer recid; public final String givenname; public final String surname; public final Double cost; public final Integer postcode; - public SchemaWithMixedDataType(Integer recid, String givename, String surname, Double cost, Integer postcode) { + public PersonMixed(Integer recid, String givename, String surname, Double cost, Integer postcode) { this.recid = recid; this.givenname = givename; this.surname = surname; diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java b/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java deleted file mode 100644 index f720118ba..000000000 --- a/common/client/src/test/java/zingg/common/client/schema/SchemaCluster.java +++ /dev/null @@ -1,15 +0,0 @@ -package zingg.common.client.schema; - -public class SchemaCluster { - public final Integer z_zid; - public final Integer z_cluster; - public final Integer z_score; - public final String z_zsource; - - public SchemaCluster(Integer z_zid, Integer z_cluster, Integer z_score, String z_zsource) { - this.z_zid = z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - this.z_zsource = z_zsource; - } -} diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java b/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java deleted file mode 100644 index 0f8847498..000000000 --- a/common/client/src/test/java/zingg/common/client/schema/SchemaClusterNull.java +++ /dev/null @@ -1,15 +0,0 @@ -package zingg.common.client.schema; - -public class SchemaClusterNull { - public final Integer z_z_zid; - public final Integer z_cluster; - public final Integer z_score; - public final 
String z_zsource; - - public SchemaClusterNull(Integer z_z_zid, Integer z_cluster, Integer z_score, String z_zsource) { - this.z_z_zid = z_z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - this.z_zsource = z_zsource; - } -} \ No newline at end of file diff --git a/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java b/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java deleted file mode 100644 index 83a443efa..000000000 --- a/common/client/src/test/java/zingg/common/client/schema/SchemaZScore.java +++ /dev/null @@ -1,13 +0,0 @@ -package zingg.common.client.schema; - -public class SchemaZScore { - public final Integer z_zid; - public final Integer z_cluster; - public final Integer z_score; - - public SchemaZScore(Integer z_zid, Integer z_cluster, Integer z_score) { - this.z_zid = z_zid; - this.z_cluster = z_cluster; - this.z_score = z_score; - } -} diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/spark/client/src/test/java/zingg/client/TestZFrameBase.java index dcb59dac2..96c5cecfa 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/spark/client/src/test/java/zingg/client/TestZFrameBase.java @@ -9,12 +9,12 @@ import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.core.ZinggException; -import zingg.common.client.schema.Schema; -import zingg.common.client.schema.SchemaWithMixedDataType; -import zingg.common.client.schema.SchemaZScore; -import zingg.common.client.schema.SchemaInput; -import zingg.common.client.schema.SchemaCluster; -import zingg.common.client.schema.SchemaClusterNull; +import zingg.common.client.model.Person; +import zingg.common.client.model.PersonMixed; +import zingg.common.client.model.ClusterZScore; +import zingg.common.client.model.ClusterSource; +import zingg.common.client.model.ClusterPairOne; +import zingg.common.client.model.ClusterPairTwo; import java.lang.reflect.Field; import java.util.ArrayList; @@ -48,13 +48,13 @@ public TestZFrameBase(DFObjectUtil dfObjectUtil) { @Test public void testCreateSparkDataFrameAndGetDF() throws Exception { - List sampleDataSet = createSampleDataList(); + List sampleDataSet = createSampleDataList(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); //assert rows List rows = zFrame.collectAsList(); - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); for (int idx = 0; idx < sampleDataSet.size(); idx++) { R row = rows.get(idx); for (Field column : fields) { @@ -67,14 +67,14 @@ public void testCreateSparkDataFrameAndGetDF() throws Exception { @Test public void testColumnsNamesandCount() throws Exception { - List sampleDataSet = createSampleDataList(); + List sampleDataSet = createSampleDataList(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); //assert on fields List fieldsInTestData = new ArrayList<>(); List fieldsInZFrame = new ArrayList<>(); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); Arrays.stream(zFrame.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); 
assertEquals(fieldsInTestData, fieldsInZFrame, "Columns of sample data and zFrame are not equal"); @@ -82,9 +82,9 @@ public void testColumnsNamesandCount() throws Exception { @Test public void testSelectWithSingleColumnName() throws Exception { - List sampleDataSet = createSampleDataList(); //List + List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); String colName = "recid"; List rows = zFrame.select(colName).collectAsList(); for (int idx = 0; idx < sampleDataSet.size(); idx++) { @@ -101,9 +101,9 @@ public void testSelectWithSingleColumnName() throws Exception { @Disabled @Test public void testSelectWithColumnList() throws Exception { - List sampleDataSet = createSampleDataList(); //List + List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); List columnList = (List) Arrays.asList("recid", "surname", "postcode"); List rows = zFrame.select(columnList).collectAsList(); @@ -126,8 +126,8 @@ public void testSelectWithColumnList() throws Exception { @Disabled @Test public void testSelectWithColumnArray() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); C[] columnArray = (C[]) new Object[3]; columnArray[0] = (C) "recid"; @@ -149,8 +149,8 @@ public void testSelectWithColumnArray() throws Exception { @Test public void testSelectWithMultipleColumnNamesAsString() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); List rows = zFrame.select("recid", "surname", "postcode").collectAsList(); @@ -167,8 +167,8 @@ public void testSelectWithMultipleColumnNamesAsString() throws Exception { @Test public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); List rows = zFrame.selectExpr("recid as RecordId", "surname as FamilyName", "postcode as Pin").collectAsList(); @@ -186,13 +186,13 @@ public void testSelectExprByPassingColumnStringsAsInSQLStatement() throws Except @Test public void testDropSingleColumn() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); List fieldsInZFrame = new ArrayList<>(); List fieldsInTestData = new ArrayList<>(); Arrays.stream(zFrame.drop("recid").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + 
Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.remove("recid"); assertEquals(fieldsInTestData, fieldsInZFrame, "Fields in zFrame and sample data doesn't match"); @@ -200,13 +200,13 @@ public void testDropSingleColumn() throws Exception { @Test public void testDropColumnsAsStringArray() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); List fieldsInZFrame = new ArrayList<>(); List fieldsInTestData = new ArrayList<>(); Arrays.stream(zFrame.drop("recid", "surname", "postcode").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.remove("recid"); fieldsInTestData.remove("surname"); fieldsInTestData.remove("postcode"); @@ -217,15 +217,15 @@ public void testDropColumnsAsStringArray() throws Exception { @Test public void testLimit() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); int len = 5; List rows = zFrame.limit(len).collectAsList(); assertEquals(rows.size(), len, "Size is not equal"); //assert on rows - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); for (int idx = 0; idx < len; idx++) { R row = rows.get(idx); for (Field column : fields) { @@ -238,11 +238,11 @@ public void testLimit() throws Exception { @Test public void testHead() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); R row = zFrame.head(); - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); for (Field column : fields) { String columnName = column.getName(); assertEquals(column.get(sampleDataSet.get(0)).toString(), zFrame.getAsString(row, columnName), @@ -252,8 +252,8 @@ public void testHead() throws Exception { @Test public void testGetAsInt() throws Exception { - List sampleDataSet = createSampleDataListWithMixedDataType(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, PersonMixed.class); R row = zFrame.head(); assertTrue(zFrame.getAsInt(row, "recid") == sampleDataSet.get(0).recid, @@ -262,8 +262,8 @@ public void testGetAsInt() throws Exception { @Test public void testGetAsString() throws Exception { - List sampleDataSet = createSampleDataListWithMixedDataType(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + 
ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, PersonMixed.class); R row = zFrame.head(); assertEquals(zFrame.getAsString(row, "surname"), sampleDataSet.get(0).surname, @@ -272,8 +272,8 @@ public void testGetAsString() throws Exception { @Test public void testGetAsDouble() throws Exception { - List sampleDataSet = createSampleDataListWithMixedDataType(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaWithMixedDataType.class); + List sampleDataSet = createSampleDataListWithMixedDataType(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, PersonMixed.class); R row = zFrame.head(); assertEquals(zFrame.getAsDouble(row, "cost"), sampleDataSet.get(0).cost, @@ -282,8 +282,8 @@ public void testGetAsDouble() throws Exception { @Test public void testWithColumnForIntegerValue() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); String newCol = NEW_COLUMN; int newColVal = 36; @@ -292,7 +292,7 @@ public void testWithColumnForIntegerValue() throws Exception { List fieldsInTestData = new ArrayList<>(); List fieldsInZFrame = new ArrayList<>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); //Assert on columns @@ -307,8 +307,8 @@ public void testWithColumnForIntegerValue() throws Exception { @Test public void testWithColumnForDoubleValue() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); String newCol = NEW_COLUMN; double newColVal = 3.14; ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); @@ -316,7 +316,7 @@ public void testWithColumnForDoubleValue() throws Exception { List fieldsInTestData = new ArrayList<>(); List fieldsInZFrame = new ArrayList<>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); //Assert on columns @@ -331,8 +331,8 @@ public void testWithColumnForDoubleValue() throws Exception { @Test public void testWithColumnForStringValue() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); String newCol = NEW_COLUMN; String newColVal = "zingg"; ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); @@ -340,7 +340,7 @@ public void testWithColumnForStringValue() throws Exception { List fieldsInTestData = new ArrayList<>(); List 
fieldsInZFrame = new ArrayList<>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); //Assert on columns @@ -355,8 +355,8 @@ public void testWithColumnForStringValue() throws Exception { @Test public void testWithColumnForAnotherColumn() throws Exception { - List sampleDataSet = createSampleDataList(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Schema.class); + List sampleDataSet = createSampleDataList(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); String oldCol = STR_RECID; String newCol = NEW_COLUMN; ZFrame zFrameWithAddedColumn = zFrame.withColumn(newCol, zFrame.col(oldCol)); @@ -364,7 +364,7 @@ public void testWithColumnForAnotherColumn() throws Exception { List fieldsInTestData = new ArrayList<>(); List fieldsInZFrame = new ArrayList<>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); - Arrays.stream(Schema.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); + Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); //Assert on columns @@ -379,26 +379,26 @@ public void testWithColumnForAnotherColumn() throws Exception { @Test public void testGetMaxVal() throws Exception { - List sampleDataSet = createSampleDataZScore(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + List sampleDataSet = createSampleDataZScore(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, ClusterZScore.class); - assertEquals(400, zFrame.getMaxVal(ColName.CLUSTER_COLUMN), + assertEquals("400", zFrame.getMaxVal(ColName.CLUSTER_COLUMN), "Max value is not as expected"); } @Test public void testGroupByMinMax() throws Exception { - List sampleDataSet = createSampleDataZScore(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + List sampleDataSet = createSampleDataZScore(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, ClusterZScore.class); ZFrame groupByDF = zFrame.groupByMinMaxScore(zFrame.col(ColName.ID_COL)); List assertionRows = groupByDF.collectAsList(); for (R row : assertionRows) { - if (groupByDF.getAsInt(row, "z_zid") == 1) { - assertEquals(1001, groupByDF.getAsInt(row, "z_minScore"), + if (groupByDF.getAsLong(row, "z_zid") == 1.0) { + assertEquals(1001.0, groupByDF.getAsDouble(row, "z_minScore"), "z_minScore is not as expected"); - assertEquals(2002, groupByDF.getAsInt(row, "z_maxScore"), + assertEquals(2002.0, groupByDF.getAsDouble(row, "z_maxScore"), "z_maxScore is not as expected"); } } @@ -406,17 +406,17 @@ public void testGroupByMinMax() throws Exception { @Test public void testGroupByMinMax2() throws Exception { - List sampleDataSet = createSampleDataZScore(); //List - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, SchemaZScore.class); + List sampleDataSet = createSampleDataZScore(); //List + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, ClusterZScore.class); ZFrame groupByDF = 
zFrame.groupByMinMaxScore(zFrame.col(ColName.CLUSTER_COLUMN)); List assertionRows = groupByDF.collectAsList(); for (R row : assertionRows) { - if (groupByDF.getAsInt(row, "z_cluster") == 100) { - assertEquals(900, groupByDF.getAsInt(row, "z_minScore"), + if ("100".equals(groupByDF.getAsString(row, "z_cluster"))) { + assertEquals(900.0, groupByDF.getAsDouble(row, "z_minScore"), "z_minScore is not as expected"); - assertEquals(9002, groupByDF.getAsInt(row, "z_maxScore"), + assertEquals(9002.0, groupByDF.getAsDouble(row, "z_maxScore"), "z_maxScore is not as expected"); } } @@ -424,10 +424,10 @@ public void testGroupByMinMax2() throws Exception { @Test public void testRightJoinMultiCol() throws Exception { - List sampleDataSetInput = createSampleDataInput(); //List - ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class); - List sampleDataSetCluster = createSampleDataCluster(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaCluster.class); + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, ClusterSource.class); + List sampleDataSetCluster = createSampleDataCluster(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairOne.class); ZFrame joinedData = zFrameCluster.join(zFrameInput, ColName.ID_COL, ColName.SOURCE_COL, ZFrame.RIGHT_JOIN); assertEquals(10, joinedData.count()); @@ -435,18 +435,18 @@ public void testRightJoinMultiCol() throws Exception { @Test public void testFilterInCond() throws Exception { - List sampleDataSetInput = createSampleDataInput(); //List - ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, SchemaInput.class); - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class); + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, ClusterSource.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); ZFrame filteredData = zFrameInput.filterInCond(ColName.ID_COL, zFrameCluster, ColName.COL_PREFIX + ColName.ID_COL); assertEquals(5, filteredData.count()); } @Test public void testFilterNotNullCond() throws Exception { - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); ZFrame filteredData = zFrameCluster.filterNotNullCond(ColName.SOURCE_COL); assertEquals(3, filteredData.count()); @@ -454,8 +454,8 @@ public void testFilterNotNullCond() throws Exception { @Test public void testFilterNullCond() throws Exception { - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, SchemaClusterNull.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); ZFrame filteredData = zFrameCluster.filterNullCond(ColName.SOURCE_COL); 
assertEquals(2, filteredData.count()); @@ -463,18 +463,18 @@ public void testFilterNullCond() throws Exception { @Test public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exception { - List sampleData = createSampleDataList(); - List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Schema.class); + List sampleData = createSampleDataList(); + List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Person.class); String[] columnArray = new String[]{"surname", "postcode"}; ZFrame zFrameDeDuplicated = zFrame.dropDuplicates(columnArray); List rows = zFrameDeDuplicated.collectAsList(); - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); int matchedCount = 0; - for (Schema schema : sampleDataWithDistinctSurnameAndPostCode) { + for (Person schema : sampleDataWithDistinctSurnameAndPostCode) { for (R row : rows) { boolean rowMatched = true; for (Field column : fields) { @@ -501,20 +501,20 @@ public void testDropDuplicatesConsideringGivenColumnsAsStringArray() throws Exce @Test public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws Exception { - List sampleDataSetCluster = createSampleDataList(); - List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Schema.class); + List sampleDataSetCluster = createSampleDataList(); + List sampleDataWithDistinctSurnameAndPostCode = createSampleDataListWithDistinctSurnameAndPostcode(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, Person.class); ZFrame zFrameDeDuplicated = zFrame.dropDuplicates("surname", "postcode"); List rows = zFrameDeDuplicated.collectAsList(); - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); int matchedCount = 0; - for (Schema schema : sampleDataWithDistinctSurnameAndPostCode) { + for (Person person : sampleDataWithDistinctSurnameAndPostCode) { for (R row : rows) { boolean rowMatched = true; for (Field column : fields) { String columnName = column.getName(); - if (!column.get(schema).toString(). + if (!column.get(person).toString(). equals(zFrame.getAsString(row, columnName))) { rowMatched = false; break; @@ -536,15 +536,15 @@ public void testDropDuplicatesConsideringGivenIndividualColumnsAsString() throws @Test public void testSortDescending() throws Exception { - List sampleData = createSampleDataListWithMixedDataType(); + List sampleData = createSampleDataListWithMixedDataType(); sampleData.sort((a, b) -> a.recid > b.recid ? 
-1 : 1); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, PersonMixed.class); String col = STR_RECID; ZFrame zFrameSortedDesc = zFrame.sortDescending(col); List rows = zFrameSortedDesc.collectAsList(); - List fields = List.of(SchemaWithMixedDataType.class.getDeclaredFields()); + List fields = List.of(PersonMixed.class.getDeclaredFields()); for (int idx = 0; idx < sampleData.size(); idx++) { R row = rows.get(idx); for (Field column : fields) { @@ -570,15 +570,15 @@ public void testSortDescending() throws Exception { @Test public void testSortAscending() throws Exception { - List sampleData = createSampleDataListWithMixedDataType(); + List sampleData = createSampleDataListWithMixedDataType(); sampleData.sort((a, b) -> a.recid < b.recid ? -1 : 1); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, SchemaWithMixedDataType.class); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, PersonMixed.class); String col = STR_RECID; ZFrame zFrameSortedAsc = zFrame.sortAscending(col); List rows = zFrameSortedAsc.collectAsList(); - List fields = List.of(SchemaWithMixedDataType.class.getDeclaredFields()); + List fields = List.of(PersonMixed.class.getDeclaredFields()); for (int idx = 0; idx < sampleData.size(); idx++) { R row = rows.get(idx); for (Field column : fields) { @@ -604,21 +604,21 @@ public void testSortAscending() throws Exception { @Test public void testIsEmpty() throws Exception { - List emptySampleData = createEmptySampleData(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(emptySampleData, Schema.class); + List emptySampleData = createEmptySampleData(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(emptySampleData, Person.class); assertTrue(zFrame.isEmpty(), "zFrame is not empty"); } @Test public void testDistinct() throws Exception { - List sampleData = createSampleDataList(); - List sampleDataDistinct = createSampleDataListDistinct(); - ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Schema.class); + List sampleData = createSampleDataList(); + List sampleDataDistinct = createSampleDataListDistinct(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleData, Person.class); List rows = zFrame.distinct().collectAsList(); - List fields = List.of(Schema.class.getDeclaredFields()); + List fields = List.of(Person.class.getDeclaredFields()); for (int idx = 0; idx < sampleDataDistinct.size(); idx++) { R row = rows.get(idx); for (Field column : fields) { From 01eb4eaa64ab9b5b96eebedec47f592ef86ba356 Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 8 Jul 2024 13:46:49 +0530 Subject: [PATCH 157/219] moved TestZFrameBase to common-client --- .../test/java/zingg/common}/client/TestZFrameBase.java | 8 +++----- .../client/src/test/java/zingg/client/TestSparkFrame.java | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) rename {spark/client/src/test/java/zingg => common/client/src/test/java/zingg/common}/client/TestZFrameBase.java (99%) diff --git a/spark/client/src/test/java/zingg/client/TestZFrameBase.java b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java similarity index 99% rename from spark/client/src/test/java/zingg/client/TestZFrameBase.java rename to common/client/src/test/java/zingg/common/client/TestZFrameBase.java index 96c5cecfa..7cdac1a9b 100644 --- a/spark/client/src/test/java/zingg/client/TestZFrameBase.java +++ b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java @@ -1,14 +1,12 @@ 
-package zingg.client;
+package zingg.common.client;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
-import zingg.common.client.ZFrame;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
-import zingg.common.core.ZinggException;
 import zingg.common.client.model.Person;
 import zingg.common.client.model.PersonMixed;
 import zingg.common.client.model.ClusterZScore;
 import zingg.common.client.model.ClusterSource;
 import zingg.common.client.model.ClusterPairOne;
 import zingg.common.client.model.ClusterPairTwo;
@@ -562,7 +560,7 @@ public void testSortDescending() throws Exception {
                 assertEquals(column.get(sampleData.get(idx)), zFrameSortedDesc.getAsLong(row, columnName),
                         "value in ZFrame and sample input is not same");
             } else {
-                throw new ZinggException("Not a valid data type");
+                throw new Exception("Not a valid data type");
             }
         }
     }
@@ -596,7 +594,7 @@ public void testSortAscending() throws Exception {
                 assertEquals(column.get(sampleData.get(idx)), zFrame.getAsLong(row, columnName),
                         "value in ZFrame and sample input is not same");
             } else {
-                throw new ZinggException("Not a valid data type");
+                throw new Exception("Not a valid data type");
             }
         }
     }
diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
index e82acb5fb..a55ab8346 100644
--- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java
+++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
@@ -18,6 +18,7 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import zingg.common.client.IArguments;
+import zingg.common.client.TestZFrameBase;
 import zingg.common.client.ZFrame;
 import zingg.spark.client.SparkFrame;
 import zingg.spark.client.util.SparkDFObjectUtil;

From d099b62c870b8cd12fa371369eed2cc84f0d9491 Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 14:38:46 +0530
Subject: [PATCH 158/219] added withSession interface

---
 .../common/client/util/DFObjectUtil.java      | 14 --------------
 .../zingg/common/client/util/WithSession.java |  9 +++++++++
 .../spark/client/util/SparkDFObjectUtil.java  | 12 +++++++-----
 .../spark/client/util/WithSparkSession.java   | 19 +++++++++++++++++++
 .../java/zingg/client/TestSparkFrame.java     |  8 ++++++--
 5 files changed, 41 insertions(+), 21 deletions(-)
 create mode 100644 common/client/src/main/java/zingg/common/client/util/WithSession.java
 create mode 100644 spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java

diff --git a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
index e8a190a4d..d277074b7 100644
--- a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
+++ b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java
@@ -6,20 +6,6 @@
 
 public abstract class DFObjectUtil {
 
-    S session;
-
-    public DFObjectUtil(S s) {
-        this.session = s;
-    }
-
-    public S getSession() {
-        return this.session;
-    }
-
-    public void setSession(S session) {
-        this.session = session;
-    }
-
     public abstract ZFrame getDFFromObjectList(List objList, Class objClass) throws Exception;
 
 }
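With the session field and its accessors removed, DFObjectUtil keeps a single responsibility: turning a list of test POJOs into an engine-specific ZFrame. A minimal sketch of the call pattern, assuming the Spark wiring introduced just below (WithSparkSession and SparkDFObjectUtil) and the Person test model used by TestZFrameBase; the generic parameters here are inferred, since the angle-bracket contents did not survive the formatting of this patch:

    import java.util.List;

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    import zingg.common.client.ZFrame;
    import zingg.common.client.model.Person;
    import zingg.spark.client.util.SparkDFObjectUtil;
    import zingg.spark.client.util.WithSparkSession;

    public class DFObjectUtilSketch {

        // Illustrative only: materialise POJOs into a ZFrame once a session
        // holder exists; the utility itself carries no session state anymore.
        public static ZFrame<Dataset<Row>, Row, Column> toFrame(WithSparkSession withSession,
                List<Person> people) throws Exception {
            SparkDFObjectUtil util = new SparkDFObjectUtil(withSession);
            return util.getDFFromObjectList(people, Person.class);
        }
    }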
diff --git a/common/client/src/main/java/zingg/common/client/util/WithSession.java b/common/client/src/main/java/zingg/common/client/util/WithSession.java
new file mode 100644
index 000000000..7fbbc1780
--- /dev/null
+++ b/common/client/src/main/java/zingg/common/client/util/WithSession.java
@@ -0,0 +1,9 @@
+package zingg.common.client.util;
+
+public interface WithSession {
+
+    public void setSession(S s);
+
+    public S getSession();
+
+}
diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
index 1187fe533..9eb61eec5 100644
--- a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
+++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java
@@ -11,24 +11,26 @@
 
 import zingg.common.client.ZFrame;
 import zingg.common.client.util.DFObjectUtil;
+import zingg.common.client.util.WithSession;
 import zingg.spark.client.SparkFrame;
 
 public class SparkDFObjectUtil extends DFObjectUtil, Row, Column> {
 
-    public SparkDFObjectUtil(SparkSession s) {
-        super(s);
+    private final WithSession withSparkSession;
+
+    public SparkDFObjectUtil(WithSession withSparkSession) {
+        this.withSparkSession = withSparkSession;
     }
 
     @Override
     public ZFrame, Row, Column> getDFFromObjectList(List objList, Class objClass) throws Exception {
-        if(objList==null || objClass==null) return null;
+        if(objList == null || objClass == null) return null;
 
         SparkStructTypeFromPojoClass stpc = new SparkStructTypeFromPojoClass();
         List rows = Arrays.asList(RowsFromObjectList.getRows(objList));
         StructType structType = stpc.getStructType(objClass);
-        return new SparkFrame(getSession().createDataFrame(rows, structType));
+        return new SparkFrame(withSparkSession.getSession().createDataFrame(rows, structType));
     }
-
 }
diff --git a/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java b/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java
new file mode 100644
index 000000000..327eca0d4
--- /dev/null
+++ b/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java
@@ -0,0 +1,19 @@
+package zingg.spark.client.util;
+
+import org.apache.spark.sql.SparkSession;
+import zingg.common.client.util.WithSession;
+
+public class WithSparkSession implements WithSession {
+
+    private SparkSession sparkSession;
+
+    @Override
+    public void setSession(SparkSession sparkSession) {
+        this.sparkSession = sparkSession;
+    }
+
+    @Override
+    public SparkSession getSession() {
+        return this.sparkSession;
+    }
+}
diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
index a55ab8346..f2cd6df2e 100644
--- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java
+++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java
@@ -20,8 +20,10 @@
 import zingg.common.client.IArguments;
 import zingg.common.client.TestZFrameBase;
 import zingg.common.client.ZFrame;
+import zingg.common.client.util.WithSession;
 import zingg.spark.client.SparkFrame;
 import zingg.spark.client.util.SparkDFObjectUtil;
+import zingg.spark.client.util.WithSparkSession;
 
 import java.util.Arrays;
 
@@ -32,10 +34,10 @@ public class TestSparkFrame extends TestZFrameBase, R
 	public static IArguments args;
 	public static JavaSparkContext ctx;
 	public static SparkSession spark;
-	private SparkSession sparkSession;
+	public static WithSession withSession;
 
 	public TestSparkFrame() {
-		super(new SparkDFObjectUtil(spark));
+		super(new SparkDFObjectUtil(withSession));
 	}
 
 	@BeforeAll
 	protected static void setUpSpark() {
@@ -51,6 +53,8 @@
 				.appName("Zingg" + "Junit")
 				.getOrCreate();
 			ctx = new JavaSparkContext(spark.sparkContext());
+			withSession = new WithSparkSession();
+			withSession.setSession(spark);
 		} catch (Throwable e) {
 			if (LOG.isDebugEnabled())
 				e.printStackTrace();
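The net effect of this commit is an inversion of the dependency: a test can construct its SparkDFObjectUtil eagerly, while the SparkSession is created later, because the utility only dereferences the session through the WithSession holder at the moment a frame is built. A small usage sketch under the same inferred generics (the main method and local session setup are illustrative, not part of the patch):

    import org.apache.spark.sql.SparkSession;

    import zingg.common.client.util.WithSession;
    import zingg.spark.client.util.SparkDFObjectUtil;
    import zingg.spark.client.util.WithSparkSession;

    public class WithSessionSketch {
        public static void main(String[] args) {
            // The holder is handed out before any SparkSession exists.
            WithSession<SparkSession> withSession = new WithSparkSession();
            SparkDFObjectUtil util = new SparkDFObjectUtil(withSession);

            // Populated later, e.g. from a JUnit @BeforeAll, without touching
            // the already-constructed SparkDFObjectUtil.
            withSession.setSession(SparkSession.builder()
                    .master("local[*]")
                    .appName("WithSessionSketch")
                    .getOrCreate());

            // util is now ready to build frames through the live session.
            System.out.println("session wired: " + (withSession.getSession() != null));
            withSession.getSession().stop();
        }
    }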
From 32bb530839a84018b536f7a44e8665f5075d7e8c Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 15:47:23 +0530
Subject: [PATCH 159/219] reverted back jackson-module-scala version

updated jackson version in spark-client -> 2.12.0
---
 pom.xml              | 2 +-
 spark/client/pom.xml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pom.xml b/pom.xml
index 76decc5e6..c63f6ffcc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -146,7 +146,7 @@
 
             com.fasterxml.jackson.module
             jackson-module-scala_2.12
-            2.17.1
+            2.12.2
 
diff --git a/spark/client/pom.xml b/spark/client/pom.xml
index a42974de4..e5c2146c4 100644
--- a/spark/client/pom.xml
+++ b/spark/client/pom.xml
@@ -8,8 +8,8 @@
     zingg-spark-client
     jar
 
-        2.17.0
-        2.17.0
+        2.12.0
+        2.12.0
 
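The pin back to the 2.12 line presumably keeps jackson-module-scala and jackson-databind aligned with the Jackson that the targeted Spark distribution already bundles; mixing 2.12 and 2.17 jars typically fails at runtime, since the Scala module verifies the databind version when it is registered. A quick diagnostic, not part of the patch, for checking which jackson-databind actually wins on a classpath:

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class JacksonVersionCheck {
        public static void main(String[] args) {
            // ObjectMapper implements Versioned; this reports the jackson-databind
            // version resolved on the classpath, which must line up with the
            // jackson-module-scala_2.12 version pinned in the pom.
            System.out.println(new ObjectMapper().version());
        }
    }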
From 57f50eb8ba7c4e217d3a639e2893091e7e8e12c3 Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 17:04:14 +0530
Subject: [PATCH 160/219] formatted

---
 docs/settingUpZingg.md | 97 ++++++++++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 33 deletions(-)

diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md
index 9f1f3982a..9bfef4453 100644
--- a/docs/settingUpZingg.md
+++ b/docs/settingUpZingg.md
@@ -8,7 +8,7 @@ sudo apt update
 
 ****
 
-_**Step 0 : Install Ubuntu on WSL2 on Windows**_
+**Step 0 : Install Ubuntu on WSL2 on Windows**
 
 * Install wsl: Type the following command in **Windows PowerShell**.
 ```
@@ -24,31 +24,31 @@
 
 ****
 
-_**Step 1 : Clone the Zingg Repository**_
+**Step 1 : Clone the Zingg Repository**
 
 * Install and SetUp Git: **sudo apt install git**
 * Verify : **git --version**
 * Set up Git by following the [tutorial](https://www.digitalocean.com/community/tutorials/how-to-install-git-on-ubuntu-20-04).
 * Clone the Zingg Repository: **git clone https://github.com/zinggAI/zingg.git**
 
-_**Note :-**_ It is suggested to fork the repository to your account and then clone the repository.
+**Note :-** It is suggested to fork the repository to your account and then clone the repository.
 
 ****
 
-_**Step 2 : Install JDK 1.8 (Java Development Kit)**_
+**Step 2 : Install JDK 11 (Java Development Kit)**
 
-* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java8 JDK1.8 in Ubuntu.
+* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java 11 (JDK 11) in Ubuntu.
 * For example:
 
 ```
-sudo apt install openjdk-8-jdk openjdk-8-jre
+sudo apt install openjdk-11-jdk openjdk-11-jre
 javac -version
 java -version
 ```
 
 ****
 
-_**Step 3 : Install Apache Spark -**_
+**Step 3 : Install Apache Spark -**
 
 * Download Apache Spark - from the [Apache Spark Official Website](https://spark.apache.org/downloads.html).
 * Install downloaded Apache Spark - on your Ubuntu by following [this tutorial](https://computingforgeeks.com/how-to-install-apache-spark-on-ubuntu-debian/).
@@ -63,11 +63,11 @@ sudo mv spark-3.5.0-bin-hadoop3 /opt/spark
 
 Make sure that spark version you have installed is compatible with java you have installed, and Zingg is supporting those versions.
 
-_**Note :-**_ Zingg supports Spark 3.5 and the corresponding Java version.
+**Note :-** Zingg supports Spark 3.5 and the corresponding Java version.
 
 ****
 
-_**Step 4 : Install Apache Maven**_
+**Step 4 : Install Apache Maven**
 
 * Install the latest maven package.
@@ -79,66 +79,97 @@ rm -rf apache-maven-3.8.8-bin.tar.gz
 cd apache-maven-3.8.8/
 cd bin
 ./mvn --version
+
+Make sure that `mvn --version` also displays the correct Java version (Java 11):
+Apache Maven 3.8.7
+Maven home: /usr/share/maven
+Java version: 11.0.23, vendor: Ubuntu, runtime: /usr/lib/jvm/java-11-openjdk-amd64
 ```
 
 ****
 
-_**Step 5 : Update Env Variables**_
+**Step 5 : Update Env Variables**
 
-Open .bashrc and add env variables at end of file
+* Open .bashrc and add env variables at end of file
 
 ```
 vim ~/.bashrc
-
 export SPARK_HOME=/opt/spark
 export SPARK_MASTER=local[\*]
 export MAVEN_HOME=/home/ubuntu/apache-maven-3.8.8
-export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$MAVEN_HOME/bin
-export ZINGG_HOME=/zingg/assembly/target
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export ZINGG_HOME=/assembly/target
+export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$JAVA_HOME/bin
+
 ```
+\ will be a directory where you clone the repository of the Zingg. Similarly, if you have installed spark on a different directory you can set **SPARK\_HOME** accordingly.
 
-Save/exit and do source .bashrc so that they reflect
+**Note :-** Skip exporting MAVEN_HOME if multiple Maven versions are not required.
 
+* Save/exit and run source .bashrc so that the changes take effect
+```
 source ~/.bashrc
+```
 
-Verify:
+* Verify:
+```
 echo $PATH
 mvn --version
-```
-where \ will be a directory where you clone the repository of the Zingg. Similarly, if you have installed spark on a different directory you can set **SPARK\_HOME** accordingly.
+```
 
+**Note :-** If you have already set up **JAVA\_HOME** and **SPARK\_HOME** in the steps before you don't need to do this again.
 
 ****
 
-_**Step 6 : Compile the Zingg Repository**_
+**Step 6 : Compile the Zingg Repository**
 
-* Run the following to Compile the Zingg Repository -
+* Ensure you are on main branch
 ```
 git branch
-(Ensure you are on main branch)
+
+```
+
+* Run the following to Compile the Zingg Repository
+```
 mvn initialize
-* Run the following to Compile the Zingg Repository - **mvn initialize** and
-* **mvn clean compile package -Dspark=sparkVer**
+mvn clean compile package -Dspark=sparkVer
+```
+
+* Run the following to Compile while skipping tests
+```
+mvn initialize
+mvn clean compile package -Dspark=sparkVer -Dmaven.test.skip=true
 ```
 
-_**Note :-**_ Replace the **sparkVer** with the version of spark you installed, For example, **-Dspark=3.5** and if still facing error, include **-Dmaven.test.skip=true** with the above command.
+**Note :-** Replace the **sparkVer** with the version of Spark you installed, for example, **-Dspark=3.5**; if you still face errors, exclude tests while compiling.
 
-_**Note :-**_ substitute 3.3 with profile of the spark version you have installed. This is based on profiles specified in pom.xml
+**Note :-** Substitute the profile with the Spark version you have installed. This is based on profiles specified in pom.xml
 
 ****
 
-_**Step 7 : If had any issue with 'SPARK\_LOCAL\_IP'**_
+**Step 7 : If you had any issue with 'SPARK\_LOCAL\_IP'**
+
+* Install **net-tools**
+```
+sudo apt-get install -y net-tools
+```
+
+* Run command in the terminal to get IP address
+```
+ifconfig
+```
 
-* Install **net-tools** using **sudo apt-get install -y net-tools**
-* Run command in the terminal **ifconfig**, find the **IP address** and paste the same in **/opt/hosts** IP address of your Pc-Name
+* Paste the IP address in **/opt/hosts** against your PC name
 
 ****
 
-_**Step 8 : Run Zingg to Find Training Data**_
+**Step 8 : Run Zingg to Find Training Data**
 
-* Run this Script in terminal opened in zingg clones directory - **./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json**
+* Run this script in a terminal opened in the Zingg clone directory -
+```
+./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json
+```
 
 ****
 
+**If everything is right, it should show the Zingg banner.**

From b6159b3b3b4f11b3c96bcb2943b3c097c3fc90bb Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 19:45:01 +0530
Subject: [PATCH 161/219] generalized block test

---
 .../test/java/zingg/block/TestBlockBase.java  | 101 +++++++
 .../src/test/java/zingg/data/Constant.java    | 135 +++++++++
 .../core/src/test/java/zingg/model/Event.java |  15 +
 .../test/java/zingg/model/EventCluster.java   |  23 ++
 .../src/test/java/zingg/block/TestBlock.java  | 256 ------------------
 .../test/java/zingg/block/TestSparkBlock.java |  69 +++++
 6 files changed, 343 insertions(+), 256 deletions(-)
 create mode 100644 common/core/src/test/java/zingg/block/TestBlockBase.java
 create mode 100644 common/core/src/test/java/zingg/data/Constant.java
 create mode 100644 common/core/src/test/java/zingg/model/Event.java
 create mode 100644 common/core/src/test/java/zingg/model/EventCluster.java
 delete mode 100644 spark/core/src/test/java/zingg/block/TestBlock.java
 create mode 100644 spark/core/src/test/java/zingg/block/TestSparkBlock.java

diff --git a/common/core/src/test/java/zingg/block/TestBlockBase.java b/common/core/src/test/java/zingg/block/TestBlockBase.java
new file mode 100644
index 000000000..9f1593790
--- /dev/null
+++ b/common/core/src/test/java/zingg/block/TestBlockBase.java
@@ -0,0 +1,101 @@
+package zingg.block;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import zingg.common.client.ArgumentsUtil;
+import zingg.common.client.FieldDefinition;
+import zingg.common.client.IArguments;
+import zingg.common.client.MatchType;
+import zingg.common.client.ZFrame;
+import zingg.common.client.ZinggClientException;
+import zingg.common.client.util.DFObjectUtil;
+import zingg.common.core.block.Canopy;
+import zingg.common.core.block.Tree;
+import zingg.common.core.util.BlockingTreeUtil;
+import zingg.common.core.util.HashUtil;
+import zingg.model.Event;
+import zingg.model.EventCluster;
+import zingg.data.Constant;
+
+public abstract class TestBlockBase {
+
+    public ArgumentsUtil argumentsUtil = new ArgumentsUtil();
+    public final DFObjectUtil dfObjectUtil;
+    public final HashUtil hashUtil;
+    public final BlockingTreeUtil blockingTreeUtil;
+
+    public TestBlockBase(DFObjectUtil dfObjectUtil, HashUtil hashUtil, BlockingTreeUtil blockingTreeUtil) {
+        this.dfObjectUtil = dfObjectUtil;
+        this.hashUtil = hashUtil;
+        this.blockingTreeUtil = blockingTreeUtil;
+    }
+
+    @Test
+    public void testTree() throws Throwable {
+
+        // form tree
+        ZFrame zFrameEvent = dfObjectUtil.getDFFromObjectList(Constant.createSampleEventData(), Event.class);
+        ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(Constant.createSampleClusterEventData(), EventCluster.class);
+        IArguments args = getArguments();
+
+        Tree> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1,
+                args, hashUtil.getHashFunctionList());
+
+        // primary deciding is unique year so identityInteger should have been picked
+        Canopy head = blockingTree.getHead();
+        assertEquals("identityInteger", head.getFunction().getName());
+        blockingTree.toString();
+    }
+
+    private IArguments getArguments() throws ZinggClientException {
+        String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile();
+
+        IArguments args = argumentsUtil.createArgumentsFromJSON(configFilePath, "trainMatch");
+
+        List fdList = getFieldDefList();
+
+        args.setFieldDefinition(fdList);
+        return args;
+    }
+
+    private List getFieldDefList() {
+        List fdList = new ArrayList(4);
+
+        FieldDefinition idFD = new FieldDefinition();
+        idFD.setDataType("integer");
+        idFD.setFieldName("id");
+        ArrayList matchTypelistId = new ArrayList();
+        matchTypelistId.add(MatchType.DONT_USE);
+        idFD.setMatchType(matchTypelistId);
+        fdList.add(idFD);
+
+        ArrayList matchTypelistFuzzy = new ArrayList();
+        matchTypelistFuzzy.add(MatchType.FUZZY);
+
+        FieldDefinition yearFD = new FieldDefinition();
+        yearFD.setDataType("integer");
+        yearFD.setFieldName("year");
+        yearFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(yearFD);
+
+        FieldDefinition eventFD = new FieldDefinition();
+        eventFD.setDataType("string");
+        eventFD.setFieldName("event");
+        eventFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(eventFD);
+
+        FieldDefinition commentFD = new FieldDefinition();
+        commentFD.setDataType("string");
+        commentFD.setFieldName("comment");
+        commentFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(commentFD);
+        return fdList;
+    }
+
+}
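TestSparkBlock.java, the Spark-side binding created by this commit, does not appear in this excerpt. A plausible minimal shape, inferred from the TestBlockBase constructor above and the TestSparkFrame wiring earlier; SparkHashUtil and SparkBlockingTreeUtil are assumed names for the Spark implementations of HashUtil and BlockingTreeUtil, and their constructor signatures are guesses:

    import org.apache.spark.sql.SparkSession;
    import org.junit.jupiter.api.BeforeAll;

    import zingg.block.TestBlockBase;
    import zingg.common.client.util.WithSession;
    import zingg.spark.client.util.SparkDFObjectUtil;
    import zingg.spark.client.util.WithSparkSession;

    // Hypothetical sketch only: SparkHashUtil and SparkBlockingTreeUtil stand in
    // for whatever the spark/core module actually provides.
    public class TestSparkBlockSketch extends TestBlockBase {

        static WithSession<SparkSession> withSession = new WithSparkSession();

        public TestSparkBlockSketch() {
            super(new SparkDFObjectUtil(withSession), new SparkHashUtil(), new SparkBlockingTreeUtil());
        }

        @BeforeAll
        public static void setupSession() {
            withSession.setSession(SparkSession.builder()
                    .master("local[*]")
                    .appName("TestSparkBlockSketch")
                    .getOrCreate());
        }
    }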
diff --git a/common/core/src/test/java/zingg/data/Constant.java b/common/core/src/test/java/zingg/data/Constant.java
new file mode 100644
index 000000000..8d8af0e40
--- /dev/null
+++ b/common/core/src/test/java/zingg/data/Constant.java
@@ -0,0 +1,135 @@
+package zingg.data;
+
+import zingg.model.Event;
+import zingg.model.EventCluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Constant {
+    public static List createSampleEventData() {
+
+        int row_id = 1;
+        List sample = new ArrayList<>();
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disob", "India"));
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JW", "Amritsar"));
+        sample.add(new Event(row_id++, 1930, "Civil Dis", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add((new Event(row_id++, 1942, "quit N", "Mahatma")));
+        sample.add((new Event(row_id++, 1919, "JallianWal", "Punjb")));
+        sample.add(new Event(row_id++, 1942, "quit ", "Mahatm"));
sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama")); + sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disob", "India")); + sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India")); + sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India")); + sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JW", "Amritsar")); + sample.add(new Event(row_id++, 1930, "Civil Dis", "India")); + sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new Event(row_id++, 1942, "quit N", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new Event(row_id++, 1942, "quit ", "Mahatm")); + sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama")); + sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disob", "India")); + sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India")); + sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India")); + sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JW", "Amritsar")); + sample.add(new Event(row_id++, 1930, "Civil Dis", "India")); + sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new Event(row_id++, 1942, "quit N", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new Event(row_id++, 1942, "quit ", "Mahatm")); + sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama")); + sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disob", "India")); + sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India")); + sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); + sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India")); + sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi")); + sample.add(new Event(row_id++, 1919, "JW", "Amritsar")); + sample.add(new Event(row_id++, 1930, "Civil Dis", "India")); + sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new Event(row_id++, 1942, "quit N", "Mahatma")); + sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb")); + sample.add(new 
Event(row_id++, 1942, "quit ", "Mahatm"));
+ sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama"));
+ sample.add(new Event(row_id, 1942, "quit Natin", "Mahaatma"));
+
+ return sample;
+ }
+
+ public static List<EventCluster> createSampleClusterEventData() {
+
+ int row_id = 1;
+ List<EventCluster> sample = new ArrayList<>();
+ sample.add(new EventCluster(row_id++, 1942, "quit Nation", "Mahatma",1942, "quit Nation", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma",1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+ sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+ sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+ sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 
1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + + return sample; + } +} diff --git a/common/core/src/test/java/zingg/model/Event.java b/common/core/src/test/java/zingg/model/Event.java new file mode 100644 index 000000000..c7feb391d --- /dev/null +++ b/common/core/src/test/java/zingg/model/Event.java @@ -0,0 +1,15 @@ +package zingg.model; + +public class Event { + public final Integer id; + public final Integer year; + public final String event; + public final String comment; + + public Event(Integer id, Integer year, String event, String comment) { + this.id = id; + this.year = year; + this.event = event; + this.comment = comment; + } +} diff --git a/common/core/src/test/java/zingg/model/EventCluster.java b/common/core/src/test/java/zingg/model/EventCluster.java new file mode 100644 index 000000000..7cca1a082 --- /dev/null +++ b/common/core/src/test/java/zingg/model/EventCluster.java @@ -0,0 +1,23 @@ +package zingg.model; + +public class EventCluster { + public final Integer id; + public final Integer year; + public final String event; + public final String comment; + public final Integer z_year; + public final String z_event; + public final String z_comment; + public final Long z_zid; + + public EventCluster(Integer id, Integer year, String event, String comment, Integer z_year, String z_event, String z_comment, Long z_zid) { + this.id = id; + this.year = year; + this.event = event; + this.comment = comment; + this.z_year = z_year; + this.z_event = z_event; + this.z_comment = z_comment; + this.z_zid = z_zid; + } +} \ No newline at end of file diff --git a/spark/core/src/test/java/zingg/block/TestBlock.java b/spark/core/src/test/java/zingg/block/TestBlock.java deleted file mode 100644 index 17cbdb93a..000000000 --- a/spark/core/src/test/java/zingg/block/TestBlock.java +++ /dev/null @@ -1,256 +0,0 @@ -package zingg.block; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.Test; - -import zingg.common.client.ArgumentsUtil; -import zingg.common.client.FieldDefinition; -import zingg.common.client.IArguments; -import zingg.common.client.MatchType; -import zingg.common.client.ZFrame; -import zingg.common.client.ZinggClientException; -import zingg.common.core.block.Canopy; -import zingg.common.core.block.Tree; -import 
zingg.spark.client.SparkFrame; -import zingg.spark.core.executor.ZinggSparkTester; -import zingg.spark.core.util.SparkBlockingTreeUtil; -import zingg.spark.core.util.SparkHashUtil; - -public class TestBlock extends ZinggSparkTester { - - @Test - public void testTree() throws Throwable { - - ZFrame, Row, Column> testData = getTestData(); - - ZFrame, Row, Column> posDf = getPosData(); - - IArguments args = getArguments(); - - // form tree - SparkBlockingTreeUtil blockingTreeUtil = new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil()); - SparkHashUtil hashUtil = new SparkHashUtil(spark); - - Tree> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(testData, posDf, 0.5, -1, - args, hashUtil.getHashFunctionList()); - - // primary deciding is unique year so identityInteger should have been picked - Canopy head = blockingTree.getHead(); - assertEquals("identityInteger", head.getFunction().getName()); - blockingTree.toString(); - - } - - StructType testDataSchema = new StructType(new StructField[] { - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("year", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("event", DataTypes.StringType, false, Metadata.empty()), - new StructField("comment", DataTypes.StringType, false, Metadata.empty())} - ); - - StructType schemaPos = new StructType(new StructField[] { - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("year", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("event", DataTypes.StringType, false, Metadata.empty()), - new StructField("comment", DataTypes.StringType, false, Metadata.empty()), - new StructField("z_year", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("z_event", DataTypes.StringType, false, Metadata.empty()), - new StructField("z_comment", DataTypes.StringType, false, Metadata.empty()), - new StructField("z_zid", DataTypes.StringType, false, Metadata.empty())} - ); - - - - - private IArguments getArguments() throws ZinggClientException { - String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile(); - - IArguments args = argsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); - - List fdList = getFieldDefList(); - - args.setFieldDefinition(fdList); - return args; - } - - private List getFieldDefList() { - List fdList = new ArrayList(4); - - FieldDefinition idFD = new FieldDefinition(); - idFD.setDataType("integer"); - idFD.setFieldName("id"); - ArrayList matchTypelistId = new ArrayList(); - matchTypelistId.add(MatchType.DONT_USE); - idFD.setMatchType(matchTypelistId); - fdList.add(idFD); - - ArrayList matchTypelistFuzzy = new ArrayList(); - matchTypelistFuzzy.add(MatchType.FUZZY); - - - FieldDefinition yearFD = new FieldDefinition(); - yearFD.setDataType("integer"); - yearFD.setFieldName("year"); - yearFD.setMatchType(matchTypelistFuzzy); - fdList.add(yearFD); - - FieldDefinition eventFD = new FieldDefinition(); - eventFD.setDataType("string"); - eventFD.setFieldName("event"); - eventFD.setMatchType(matchTypelistFuzzy); - fdList.add(eventFD); - - FieldDefinition commentFD = new FieldDefinition(); - commentFD.setDataType("string"); - commentFD.setFieldName("comment"); - commentFD.setMatchType(matchTypelistFuzzy); - fdList.add(commentFD); - return fdList; - } - - public SparkFrame getTestData() { - int row_id = 1; - // Create a DataFrame containing test data - Row[] data = { - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - 
RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), 
"Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma") - }; - - return new SparkFrame( - spark.createDataFrame(Arrays.asList(data), - testDataSchema)); - - } - - private SparkFrame getPosData() { - int row_id = 1000; - // Create positive matching data - Row[] posData = { - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma",new Integer(1942), "quit Nation", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, 
new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2") - }; - return new SparkFrame(spark.createDataFrame(Arrays.asList(posData), schemaPos)); - } - - -} diff --git a/spark/core/src/test/java/zingg/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/block/TestSparkBlock.java new file mode 100644 index 000000000..108973ba3 --- /dev/null +++ b/spark/core/src/test/java/zingg/block/TestSparkBlock.java @@ -0,0 +1,69 @@ +package zingg.block; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import 
org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import zingg.common.client.IArguments;
+import zingg.common.client.util.WithSession;
+import zingg.spark.client.util.SparkDFObjectUtil;
+import zingg.spark.client.util.WithSparkSession;
+import zingg.spark.core.context.ZinggSparkContext;
+import zingg.spark.core.util.SparkBlockingTreeUtil;
+import zingg.spark.core.util.SparkHashUtil;
+
+public class TestSparkBlock extends TestBlockBase<Dataset<Row>, Row, Column, DataType> {
+
+ public static final Log LOG = LogFactory.getLog(TestSparkBlock.class);
+ public static IArguments args;
+ public static JavaSparkContext ctx;
+ public static ZinggSparkContext zsCTX;
+ public static SparkSession spark;
+ public static WithSession<SparkSession> withSession;
+
+ public TestSparkBlock() {
+ super(new SparkDFObjectUtil(withSession), new SparkHashUtil(spark), new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil()));
+ }
+
+ @BeforeAll
+ public static void setup() {
+ setUpSpark();
+ }
+
+ protected static void setUpSpark() {
+ try {
+ spark = SparkSession
+ .builder()
+ .master("local[*]")
+ .appName("Zingg" + "Junit")
+ .getOrCreate();
+ ctx = new JavaSparkContext(spark.sparkContext());
+ withSession = new WithSparkSession();
+ withSession.setSession(spark);
+ zsCTX = new ZinggSparkContext();
+ zsCTX.init(spark);
+ } catch (Throwable e) {
+ if (LOG.isDebugEnabled())
+ e.printStackTrace();
+ LOG.info("Problem in spark env setup");
+ }
+ }
+
+ @AfterAll
+ public static void teardown() {
+ if (ctx != null) {
+ ctx.stop();
+ ctx = null;
+ }
+ if (spark != null) {
+ spark.stop();
+ spark = null;
+ }
+ }
+}
From b293e8dcccaa1cfd56b27ac9355c8db5c350b0df Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 12:40:22 +0530
Subject: [PATCH 162/219] restructured packaging

---
 .../java/zingg/{ => common/core}/block/TestBlockBase.java | 4 +---
 .../test/java/zingg/{ => common/core}/block/TestTree.java | 6 +-----
 .../java/zingg/{ => common/core}/block/TestSparkBlock.java | 2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)
 rename common/core/src/test/java/zingg/{ => common/core}/block/TestBlockBase.java (97%)
 rename common/core/src/test/java/zingg/{ => common/core}/block/TestTree.java (79%)
 rename spark/core/src/test/java/zingg/{ => common/core}/block/TestSparkBlock.java (98%)

diff --git a/common/core/src/test/java/zingg/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
similarity index 97%
rename from common/core/src/test/java/zingg/block/TestBlockBase.java
rename to common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
index 9f1593790..18288f2c3 100644
--- a/common/core/src/test/java/zingg/block/TestBlockBase.java
+++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
@@ -1,4 +1,4 @@
-package zingg.block;
+package zingg.common.core.block;
@@ -14,8 +14,6 @@ import zingg.common.client.ZFrame;
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.DFObjectUtil;
-import zingg.common.core.block.Canopy;
-import zingg.common.core.block.Tree;
 import zingg.common.core.util.BlockingTreeUtil;
 import zingg.common.core.util.HashUtil;
 import zingg.model.Event;
diff --git 
a/common/core/src/test/java/zingg/block/TestTree.java b/common/core/src/test/java/zingg/common/core/block/TestTree.java similarity index 79% rename from common/core/src/test/java/zingg/block/TestTree.java rename to common/core/src/test/java/zingg/common/core/block/TestTree.java index 81d5044b6..93898c105 100644 --- a/common/core/src/test/java/zingg/block/TestTree.java +++ b/common/core/src/test/java/zingg/common/core/block/TestTree.java @@ -1,11 +1,7 @@ -package zingg.block; +package zingg.common.core.block; import org.junit.jupiter.api.*; -import zingg.common.core.block.Tree; - -import static org.junit.jupiter.api.Assertions.*; - public class TestTree { @Test diff --git a/spark/core/src/test/java/zingg/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java similarity index 98% rename from spark/core/src/test/java/zingg/block/TestSparkBlock.java rename to spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 108973ba3..26f5a1652 100644 --- a/spark/core/src/test/java/zingg/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -1,4 +1,4 @@ -package zingg.block; +package zingg.common.core.block; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; From ed5403ece7b14e0b7fb18329c680c5495ae48dd3 Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 13:03:02 +0530 Subject: [PATCH 163/219] created model classes for testing stop words --- .../common/core/block/TestBlockBase.java | 6 +- .../{ => common/core}/data/Constant.java | 64 ++++++++++++++++++- .../zingg/{ => common/core}/model/Event.java | 2 +- .../{ => common/core}/model/EventCluster.java | 2 +- .../zingg/common/core/model/SchemaActual.java | 26 ++++++++ .../common/core/model/SchemaOriginal.java | 17 +++++ .../core/preprocess/TestSparkStopWords.java | 4 ++ 7 files changed, 113 insertions(+), 8 deletions(-) rename common/core/src/test/java/zingg/{ => common/core}/data/Constant.java (76%) rename common/core/src/test/java/zingg/{ => common/core}/model/Event.java (91%) rename common/core/src/test/java/zingg/{ => common/core}/model/EventCluster.java (95%) create mode 100644 common/core/src/test/java/zingg/common/core/model/SchemaActual.java create mode 100644 common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java create mode 100644 spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 18288f2c3..a8f746c47 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -16,9 +16,9 @@ import zingg.common.client.util.DFObjectUtil; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.HashUtil; -import zingg.model.Event; -import zingg.model.EventCluster; -import zingg.data.Constant; +import zingg.common.core.model.Event; +import zingg.common.core.model.EventCluster; +import zingg.common.core.data.Constant; public abstract class TestBlockBase { diff --git a/common/core/src/test/java/zingg/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/Constant.java similarity index 76% rename from common/core/src/test/java/zingg/data/Constant.java rename to common/core/src/test/java/zingg/common/core/data/Constant.java index 8d8af0e40..eee323c70 100644 --- 
a/common/core/src/test/java/zingg/data/Constant.java
+++ b/common/core/src/test/java/zingg/common/core/data/Constant.java
@@ -1,7 +1,9 @@
-package zingg.data;
+package zingg.common.core.data;

-import zingg.model.Event;
-import zingg.model.EventCluster;
+import zingg.common.core.model.Event;
+import zingg.common.core.model.EventCluster;
+import zingg.common.core.model.SchemaActual;
+import zingg.common.core.model.SchemaOriginal;

 import java.util.ArrayList;
 import java.util.List;
@@ -132,4 +134,60 @@ public static List<EventCluster> createSampleClusterEventData() {

 return sample;
 }
+
+ public static List<SchemaOriginal> getData1Original() {
+
+ List<SchemaOriginal> sample = new ArrayList<>();
+ sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+ "Yes. a good application", "test"));
+ sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed",
+ "test"));
+ sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
+ sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test"));
+
+ return sample;
+ }
+
+ public static List<SchemaActual> getData1Actual() {
+
+ List<SchemaActual> sample = new ArrayList<>();
+ sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1",
+ "The zingg spark application", "two", "Yes. good application", "test"));
+ sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1",
+ "It very popular data science", "Three", "true indeed", "test"));
+ sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1",
+ "It written java scala", "four", "", "test"));
+ sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
+ "thank", "test"));
+
+ return sample;
+ }
+
+ public static List<SchemaOriginal> getData2original() {
+
+ List<SchemaOriginal> sample = new ArrayList<>();
+ sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+ "Yes. a good application", "test"));
+ sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed",
+ "test"));
+ sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
+ sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test"));
+
+ return sample;
+ }
+
+ public static List<SchemaActual> getData2Actual() {
+
+ List<SchemaActual> sample = new ArrayList<>();
+ sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1",
+ "The zingg spark application", "two", "Yes. 
good application", "test")); + sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1", + "It very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1", + "It written java scala", "four", "", "test")); + sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", + "thank", "test")); + + return sample; + } } diff --git a/common/core/src/test/java/zingg/model/Event.java b/common/core/src/test/java/zingg/common/core/model/Event.java similarity index 91% rename from common/core/src/test/java/zingg/model/Event.java rename to common/core/src/test/java/zingg/common/core/model/Event.java index c7feb391d..0bad3e883 100644 --- a/common/core/src/test/java/zingg/model/Event.java +++ b/common/core/src/test/java/zingg/common/core/model/Event.java @@ -1,4 +1,4 @@ -package zingg.model; +package zingg.common.core.model; public class Event { public final Integer id; diff --git a/common/core/src/test/java/zingg/model/EventCluster.java b/common/core/src/test/java/zingg/common/core/model/EventCluster.java similarity index 95% rename from common/core/src/test/java/zingg/model/EventCluster.java rename to common/core/src/test/java/zingg/common/core/model/EventCluster.java index 7cca1a082..f4697cf28 100644 --- a/common/core/src/test/java/zingg/model/EventCluster.java +++ b/common/core/src/test/java/zingg/common/core/model/EventCluster.java @@ -1,4 +1,4 @@ -package zingg.model; +package zingg.common.core.model; public class EventCluster { public final Integer id; diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java b/common/core/src/test/java/zingg/common/core/model/SchemaActual.java new file mode 100644 index 000000000..762420435 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/SchemaActual.java @@ -0,0 +1,26 @@ +package zingg.common.core.model; + +public class SchemaActual { + public final String z_cluster; + public final String z_zid; + public final String z_prediction; + public final String z_score; + public final String z_isMatch; + public final String field1; + public final String field2; + public final String field3; + public final String z_zsource; + + public SchemaActual(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch, + String field1, String field2, String field3, String z_zsource) { + this.z_cluster = z_cluster; + this.z_zid = z_zid; + this.z_prediction = z_prediction; + this.z_score = z_score; + this.z_isMatch = z_isMatch; + this.field1 = field1; + this.field2 = field2; + this.field3 = field3; + this.z_zsource = z_zsource; + } +} diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java b/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java new file mode 100644 index 000000000..25d55eca2 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java @@ -0,0 +1,17 @@ +package zingg.common.core.model; + +public class SchemaOriginal { + public final String z_zid; + public final String field1; + public final String field2; + public final String field3; + public final String z_zsource; + + public SchemaOriginal(String z_zid, String field1, String field2, String field3, String z_zsource) { + this.z_zid = z_zid; + this.field1 = field1; + this.field2 = field2; + this.field3 = field3; + this.z_zsource = z_zsource; + } +} diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java 
b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java new file mode 100644 index 000000000..d7eaf27a4 --- /dev/null +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java @@ -0,0 +1,4 @@ +package zingg.common.core.preprocess; + +public class TestSparkStopWords { +} From 23ee4f079a49e0089c8457bee035c328af7db267 Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 14:15:11 +0530 Subject: [PATCH 164/219] initial changes --- .../java/zingg/common/core/data/Constant.java | 79 +++++++- .../java/zingg/common/core/model/Schema.java | 9 + .../common/core/preprocess/TestStopWords.java | 173 ++++++++++-------- 3 files changed, 184 insertions(+), 77 deletions(-) create mode 100644 common/core/src/test/java/zingg/common/core/model/Schema.java diff --git a/common/core/src/test/java/zingg/common/core/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/Constant.java index eee323c70..0418f9810 100644 --- a/common/core/src/test/java/zingg/common/core/data/Constant.java +++ b/common/core/src/test/java/zingg/common/core/data/Constant.java @@ -2,6 +2,7 @@ import zingg.common.core.model.Event; import zingg.common.core.model.EventCluster; +import zingg.common.core.model.Schema; import zingg.common.core.model.SchemaActual; import zingg.common.core.model.SchemaOriginal; @@ -135,7 +136,77 @@ public static List createSampleClusterEventData() { return sample; } - public static List getData1Original() { + public static List getData1Original() { + + List sample = new ArrayList<>(); + sample.add(new Schema("The zingg is a Spark application")); + sample.add(new Schema("It is very popular in data Science")); + sample.add(new Schema("It is written in Java and Scala")); + sample.add(new Schema("Best of luck to zingg")); + + return sample; + } + + public static List getData1Expected() { + + List sample = new ArrayList<>(); + sample.add(new Schema("zingg spark application")); + sample.add(new Schema("very popular in data science")); + sample.add(new Schema("written in java and scala")); + sample.add(new Schema("best luck to zingg")); + + return sample; + } + + public static List getData2Original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + + return sample; + } + + public static List getData2Expected() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + + return sample; + } + + public static List getData3Original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. 
a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + + return sample; + } + + public static List getData3Expected() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + + return sample; + } + + public static List getData4original() { List sample = new ArrayList<>(); sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", @@ -148,7 +219,7 @@ public static List getData1Original() { return sample; } - public static List getData1Actual() { + public static List getData4Expected() { List sample = new ArrayList<>(); sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", @@ -163,7 +234,7 @@ public static List getData1Actual() { return sample; } - public static List getData2original() { + public static List getData5Original() { List sample = new ArrayList<>(); sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", @@ -176,7 +247,7 @@ public static List getData2original() { return sample; } - public static List getData2Actual() { + public static List getData5Actual() { List sample = new ArrayList<>(); sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", diff --git a/common/core/src/test/java/zingg/common/core/model/Schema.java b/common/core/src/test/java/zingg/common/core/model/Schema.java new file mode 100644 index 000000000..c608bd370 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/Schema.java @@ -0,0 +1,9 @@ +package zingg.common.core.model; + +public class Schema { + public final String statement; + + public Schema(String statement) { + this.statement = statement; + } +} diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java index c9ace5f3f..d26c3db4a 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java @@ -19,44 +19,62 @@ import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; +import zingg.common.client.ArgumentsUtil; import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; import zingg.common.client.MatchType; +import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; +import zingg.common.client.util.DFObjectUtil; +import zingg.common.core.data.Constant; +import zingg.common.core.model.Event; +import zingg.common.core.model.Schema; +import zingg.common.core.model.SchemaOriginal; import zingg.spark.client.SparkFrame; import zingg.spark.core.executor.ZinggSparkTester; import zingg.spark.core.preprocess.SparkStopWordsRemover; -public class TestStopWords extends ZinggSparkTester{ +public class TestStopWords{ public static final Log LOG = LogFactory.getLog(TestStopWords.class); + public final 
DFObjectUtil dfObjectUtil; + public final StopWordsRemover stopWordsRemover; + public IArguments args = new Arguments(); + + public TestStopWords(DFObjectUtil dfObjectUtil, StopWordsRemover stopWordsRemover) { + this.dfObjectUtil = dfObjectUtil; + this.stopWordsRemover = stopWordsRemover; + } @DisplayName ("Test Stop Words removal from Single column dataset") @Test - public void testStopWordsSingleColumn() throws ZinggClientException { + public void testStopWordsSingleColumn() throws ZinggClientException, Exception { - StructType schema = new StructType(new StructField[] { - new StructField("statement", DataTypes.StringType, false, Metadata.empty()) - }); - - Dataset datasetOriginal = spark.createDataFrame( - Arrays.asList( - RowFactory.create("The zingg is a Spark application"), - RowFactory.create("It is very popular in data Science"), - RowFactory.create("It is written in Java and Scala"), - RowFactory.create("Best of luck to zingg")), - schema); +// StructType schema = new StructType(new StructField[] { +// new StructField("statement", DataTypes.StringType, false, Metadata.empty()) +// }); +// +// Dataset datasetOriginal = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("The zingg is a Spark application"), +// RowFactory.create("It is very popular in data Science"), +// RowFactory.create("It is written in Java and Scala"), +// RowFactory.create("Best of luck to zingg")), +// schema); String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); - Dataset datasetExpected = spark.createDataFrame( - Arrays.asList( - RowFactory.create("zingg spark application"), - RowFactory.create("very popular in data science"), - RowFactory.create("written in java and scala"), - RowFactory.create("best luck to zingg")), - schema); +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("zingg spark application"), +// RowFactory.create("very popular in data science"), +// RowFactory.create("written in java and scala"), +// RowFactory.create("best luck to zingg")), +// schema); + + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class); List fdList = new ArrayList(4); @@ -72,63 +90,68 @@ public void testStopWordsSingleColumn() throws ZinggClientException { IArguments stmtArgs = new Arguments(); stmtArgs.setFieldDefinition(fdList); - StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs); +// StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs); - stopWordsObj.preprocessForStopWords(new SparkFrame(datasetOriginal)); + stopWordsRemover.preprocessForStopWords(zFrameOriginal); System.out.println("datasetOriginal.show() : "); - datasetOriginal.show(); - SparkFrame datasetWithoutStopWords = (SparkFrame)stopWordsObj.removeStopWordsFromDF(new SparkFrame(datasetOriginal),"statement",stopWords); + zFrameOriginal.show(); + ZFrame zFrameWithoutStopWords = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords); System.out.println("datasetWithoutStopWords.show() : "); - datasetWithoutStopWords.show(); + zFrameWithoutStopWords.show(); - assertTrue(datasetExpected.except(datasetWithoutStopWords.df()).isEmpty()); - assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty()); +// assertTrue(zFrameExpected.except(datasetWithoutStopWords.df()).isEmpty()); +// 
assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty()); } @Test - public void testRemoveStopWordsFromDataset() throws ZinggClientException { - StructType schemaOriginal = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); + public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { +// StructType schemaOriginal = new StructType(new StructField[] { +// new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), +// new StructField("field1", DataTypes.StringType, false, Metadata.empty()), +// new StructField("field2", DataTypes.StringType, false, Metadata.empty()), +// new StructField("field3", DataTypes.StringType, false, Metadata.empty()), +// new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) +// }); - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. a good application", "test"), - RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), - schemaOriginal); +// Dataset original = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "The zingg is a spark application", "two", +// "Yes. a good application", "test"), +// RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", +// "test"), +// RowFactory.create("30", "It is written in java and scala", "four", "", "test"), +// RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), +// schemaOriginal); - Dataset datasetExpected = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"), - RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), - RowFactory.create("30", "written java scala", "four", "", "test"), - RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), - schemaOriginal); +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test"), +// RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), +// RowFactory.create("30", "written java scala", "four", "", "test"), +// RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), +// schemaOriginal); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class); + String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWords.csv").getFile(); FieldDefinition fd = new FieldDefinition(); fd.setStopWords(stopWordsFileName); fd.setFieldName("field1"); List fieldDefinitionList = Arrays.asList(fd); - args.setFieldDefinition(fieldDefinitionList); +// args.setFieldDefinition(fieldDefinitionList); - SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); +// SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); + + ZFrame zFrameNew = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - Dataset newDataSet = ((SparkFrame)(stopWordsObj.preprocessForStopWords(new SparkFrame(original)))).df(); - assertTrue(datasetExpected.except(newDataSet).isEmpty()); - assertTrue(newDataSet.except(datasetExpected).isEmpty()); +// Dataset newDataSet = ((SparkFrame)(stopWordsRemover.preprocessForStopWords(new SparkFrame(original)))).df(); +// assertTrue(datasetExpected.except(newDataSet).isEmpty()); +// assertTrue(newDataSet.except(datasetExpected).isEmpty()); } @Test - public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException { + public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { StructType schemaOriginal = new StructType(new StructField[] { new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), new StructField("field1", DataTypes.StringType, false, Metadata.empty()), @@ -137,23 +160,27 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) }); - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. a good application", "test"), - RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), - schemaOriginal); +// Dataset original = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "The zingg is a spark application", "two", +// "Yes. a good application", "test"), +// RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", +// "test"), +// RowFactory.create("30", "It is written in java and scala", "four", "", "test"), +// RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), +// schemaOriginal); +// +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test"),
+//						RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
+//						RowFactory.create("30", "written java scala", "four", "", "test"),
+//						RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
+//				schemaOriginal);
+
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class);
-        Dataset<Row> datasetExpected = spark.createDataFrame(
-                Arrays.asList(
-                        RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"),
-                        RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
-                        RowFactory.create("30", "written java scala", "four", "", "test"),
-                        RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
-                schemaOriginal);
         String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWordsWithoutHeader.csv").getFile();
         FieldDefinition fd = new FieldDefinition();
         fd.setStopWords(stopWordsFileName);

From 24ac62cb77f3e53193b7da3d5e9072ae61bf5d78 Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:15:18 +0530
Subject: [PATCH 165/219] initial changes stop words

---
 .../core/preprocess/TestSparkStopWords.java   |  67 ++++-
 .../common/core/preprocess/TestStopWords.java | 264 +++----------------
 .../core/util/SampleStopWordRemover.java      |  63 +++++
 3 files changed, 168 insertions(+), 226 deletions(-)
 create mode 100644 spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java

diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
index d7eaf27a4..5d925cd33 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
@@ -1,4 +1,69 @@
 package zingg.common.core.preprocess;
 
-public class TestSparkStopWords {
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import zingg.common.client.IArguments;
+import zingg.common.client.ZinggClientException;
+import zingg.common.client.util.WithSession;
+import zingg.common.core.util.SampleStopWordRemover;
+import zingg.spark.client.util.SparkDFObjectUtil;
+import zingg.spark.client.util.WithSparkSession;
+import zingg.spark.core.context.ZinggSparkContext;
+
+public class TestSparkStopWords extends TestStopWords<SparkSession, Dataset<Row>, Row, Column, DataType> {
+
+    public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class);
+    public static IArguments args;
+    public static JavaSparkContext ctx;
+    public static SparkSession spark;
+    public static ZinggSparkContext zsCTX;
+    public static WithSession<SparkSession> withSession;
+
+    @BeforeAll
+    public static void setup() {
+        setUpSpark();
+    }
+
+    public TestSparkStopWords() throws ZinggClientException {
+        super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, args), zsCTX);
+    }
+
+    protected static void setUpSpark() {
+        try {
+            spark = SparkSession
+                    .builder()
+                    .master("local[*]")
+                    .appName("Zingg" + "Junit")
+                    .getOrCreate();
+            ctx = new JavaSparkContext(spark.sparkContext());
+            withSession = new WithSparkSession();
+            withSession.setSession(spark);
+            zsCTX = new ZinggSparkContext();
+            zsCTX.init(spark);
+        } catch (Throwable e) {
+            if (LOG.isDebugEnabled())
+                e.printStackTrace();
+            LOG.info("Problem in spark env setup");
+        }
+    }
+
+    @AfterAll
+    public static void teardown() {
+        if (ctx != null) {
+            ctx.stop();
+            ctx = null;
+        }
+        if (spark != null) {
+            spark.stop();
+            spark = null;
+        }
+    }
 }
diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
index d26c3db4a..ac8255cdc 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
@@ -27,284 +27,98 @@
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
+import zingg.common.core.context.Context;
 import zingg.common.core.data.Constant;
 import zingg.common.core.model.Event;
 import zingg.common.core.model.Schema;
+import zingg.common.core.model.SchemaActual;
 import zingg.common.core.model.SchemaOriginal;
 import zingg.spark.client.SparkFrame;
+import zingg.spark.core.context.ZinggSparkContext;
 import zingg.spark.core.executor.ZinggSparkTester;
 import zingg.spark.core.preprocess.SparkStopWordsRemover;
 
-public class TestStopWords<S, D, R, C, T>{
+public abstract class TestStopWords<S, D, R, C, T> {
 
     public static final Log LOG = LogFactory.getLog(TestStopWords.class);
-    public final DFObjectUtil<S, D, R, C> dfObjectUtil;
-    public final StopWordsRemover<S, D, R, C, T> stopWordsRemover;
-    public IArguments args = new Arguments();
+    private final DFObjectUtil<S, D, R, C> dfObjectUtil;
+    private final List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers;
+    private final Context<S, D, R, C, T> context;
 
-    public TestStopWords(DFObjectUtil<S, D, R, C> dfObjectUtil, StopWordsRemover<S, D, R, C, T> stopWordsRemover) {
+    public TestStopWords(DFObjectUtil<S, D, R, C> dfObjectUtil, List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers,
+                         Context<S, D, R, C, T> context) {
         this.dfObjectUtil = dfObjectUtil;
-        this.stopWordsRemover = stopWordsRemover;
+        this.stopWordsRemovers = stopWordsRemovers;
+        this.context = context;
     }
 
     @DisplayName ("Test Stop Words removal from Single column dataset")
     @Test
     public void testStopWordsSingleColumn() throws ZinggClientException, Exception {
-
-//        StructType schema = new StructType(new StructField[] {
-//                new StructField("statement", DataTypes.StringType, false, Metadata.empty())
-//        });
-//
-//        Dataset<Row> datasetOriginal = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("The zingg is a Spark application"),
-//                        RowFactory.create("It is very popular in data Science"),
-//                        RowFactory.create("It is written in Java and Scala"),
-//                        RowFactory.create("Best of luck to zingg")),
-//                schema);
         String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase();
-//        Dataset<Row> datasetExpected = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("zingg spark application"),
-//                        RowFactory.create("very popular in data science"),
-//                        RowFactory.create("written in java and scala"),
-//                        RowFactory.create("best luck to zingg")),
-//                schema);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class);
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class);
-
-        List<FieldDefinition> fdList = new ArrayList<FieldDefinition>(4);
-
-        ArrayList<MatchType> matchTypelistFuzzy = new ArrayList<MatchType>();
-        matchTypelistFuzzy.add(MatchType.FUZZY);
-
-        FieldDefinition eventFD = new FieldDefinition();
-        eventFD.setDataType("string");
-        eventFD.setFieldName("statement");
-        eventFD.setMatchType(matchTypelistFuzzy);
-        fdList.add(eventFD);
-
-        IArguments stmtArgs = new Arguments();
-        stmtArgs.setFieldDefinition(fdList);
-
-//        StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs);
+        StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(0);
 
         stopWordsRemover.preprocessForStopWords(zFrameOriginal);
-        System.out.println("datasetOriginal.show() : ");
-        zFrameOriginal.show();
-        ZFrame<D, R, C> zFrameWithoutStopWords = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords);
-        System.out.println("datasetWithoutStopWords.show() : ");
-        zFrameWithoutStopWords.show();
-
-//        assertTrue(zFrameExpected.except(datasetWithoutStopWords.df()).isEmpty());
-//        assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty());
+        ZFrame<D, R, C> newZFrame = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords);
+
+        assertTrue(zFrameExpected.except(newZFrame).isEmpty());
+        assertTrue(newZFrame.except(zFrameExpected).isEmpty());
     }
 
     @Test
     public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception {
-//        StructType schemaOriginal = new StructType(new StructField[] {
-//                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-//        });
 
-//        Dataset<Row> original = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "The zingg is a spark application", "two",
-//                                "Yes. a good application", "test"),
-//                        RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed",
-//                                "test"),
-//                        RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
-//                        RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")),
-//                schemaOriginal);
-//        Dataset<Row> datasetExpected = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"),
-//                        RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
-//                        RowFactory.create("30", "written java scala", "four", "", "test"),
-//                        RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
-//                schemaOriginal);
         ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class);
         ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class);
-
-        String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWords.csv").getFile();
-        FieldDefinition fd = new FieldDefinition();
-        fd.setStopWords(stopWordsFileName);
-        fd.setFieldName("field1");
-
-        List<FieldDefinition> fieldDefinitionList = Arrays.asList(fd);
-//        args.setFieldDefinition(fieldDefinitionList);
-//        SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args);
-
-        ZFrame<D, R, C> zFrameNew = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
+        StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(1);
+        ZFrame<D, R, C> newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
 
-//        Dataset<Row> newDataSet = ((SparkFrame)(stopWordsRemover.preprocessForStopWords(new SparkFrame(original)))).df();
-//        assertTrue(datasetExpected.except(newDataSet).isEmpty());
-//        assertTrue(newDataSet.except(datasetExpected).isEmpty());
+        assertTrue(zFrameExpected.except(newZFrame).isEmpty());
+        assertTrue(newZFrame.except(zFrameExpected).isEmpty());
     }
 
     @Test
     public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception {
-        StructType schemaOriginal = new StructType(new StructField[] {
-                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-        });
-
-//        Dataset<Row> original = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "The zingg is a spark application", "two",
-//                                "Yes. a good application", "test"),
-//                        RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed",
-//                                "test"),
-//                        RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
-//                        RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")),
-//                schemaOriginal);
-//
-//        Dataset<Row> datasetExpected = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"),
-//                        RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
-//                        RowFactory.create("30", "written java scala", "four", "", "test"),
-//                        RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
-//                schemaOriginal);
         ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class);
         ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class);
 
-        String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWordsWithoutHeader.csv").getFile();
-        FieldDefinition fd = new FieldDefinition();
-        fd.setStopWords(stopWordsFileName);
-        fd.setFieldName("field1");
+        StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(2);
+        ZFrame<D, R, C> newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
 
-        List<FieldDefinition> fieldDefinitionList = Arrays.asList(fd);
-        args.setFieldDefinition(fieldDefinitionList);
-
-        SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args);
-
-        System.out.println("testStopWordColumnMissingFromStopWordFile : orginal ");
-        original.show(200);
-        Dataset<Row> newDataSet = ((SparkFrame)(stopWordsObj.preprocessForStopWords(new SparkFrame(original)))).df();
-        System.out.println("testStopWordColumnMissingFromStopWordFile : newDataSet ");
-        newDataSet.show(200);
-        System.out.println("testStopWordColumnMissingFromStopWordFile : datasetExpected ");
-        datasetExpected.show(200);
-        assertTrue(datasetExpected.except(newDataSet).isEmpty());
-        assertTrue(newDataSet.except(datasetExpected).isEmpty());
+        assertTrue(zFrameExpected.except(newDataSet).isEmpty());
+        assertTrue(newDataSet.except(zFrameExpected).isEmpty());
     }
 
     @Test
-    public void testForOriginalDataAfterPostprocess() {
-        StructType schemaActual = new StructType(new StructField[] {
-                new StructField(ColName.CLUSTER_COLUMN, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.PREDICTION_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SCORE_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.MATCH_FLAG_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-        });
-
-        StructType schemaOriginal = new StructType(new StructField[] {
-                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-        });
+    public void testForOriginalDataAfterPostProcess() throws Exception {
 
-        Dataset<Row> original = spark.createDataFrame(
-                Arrays.asList(
-                        RowFactory.create("10", "The zingg is a spark application", "two",
-                                "Yes. a good application", "test"),
-                        RowFactory.create("20", "It is very popular in data science", "Three", "true indeed",
-                                "test"),
-                        RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
-                        RowFactory.create("40", "Best of luck to zingg", "Five", "thank you", "test")),
-                schemaOriginal);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData4original(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData4Expected(), SchemaActual.class);
 
-        Dataset<Row> actual = spark.createDataFrame(
-                Arrays.asList(
-                        RowFactory.create("1648811730857:10", "10", "1.0", "0.555555", "-1",
-                                "The zingg spark application", "two", "Yes. good application", "test"),
-                        RowFactory.create("1648811730857:20", "20", "1.0", "1.0", "-1",
-                                "It very popular data science", "Three", "true indeed", "test"),
-                        RowFactory.create("1648811730857:30", "30", "1.0", "0.999995", "-1",
-                                "It written java scala", "four", "", "test"),
-                        RowFactory.create("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
-                                "thank", "test")),
-                schemaActual);
+        ZFrame<D, R, C> newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal);
 
-        Dataset<Row> newDataset = ((SparkFrame)(zsCTX.getDSUtil().postprocess(new SparkFrame(actual), new SparkFrame(original)))).df();
-        assertTrue(newDataset.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(original).isEmpty());
-        assertTrue(original.except(newDataset.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty());
+        assertTrue(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(zFrameOriginal).isEmpty());
+        assertTrue(zFrameOriginal.except(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty());
     }
 
     @Test
-    public void testOriginalDataAfterPostprocessLinked() {
-        StructType schemaActual = new StructType(new StructField[] {
-                new StructField(ColName.CLUSTER_COLUMN, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.PREDICTION_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SCORE_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.MATCH_FLAG_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-        });
+    public void testOriginalDataAfterPostProcessLinked() throws Exception {
 
-        StructType schemaOriginal = new StructType(new StructField[] {
-                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-        });
-
-        Dataset<Row> original = spark.createDataFrame(
-                Arrays.asList(
-                        RowFactory.create("10", "The zingg is a spark application", "two",
-                                "Yes. a good application", "test"),
-                        RowFactory.create("20", "It is very popular in data science", "Three", "true indeed",
-                                "test"),
-                        RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
-                        RowFactory.create("40", "Best of luck to zingg", "Five", "thank you", "test")),
-                schemaOriginal);
-
-        Dataset<Row> actual = spark.createDataFrame(
-                Arrays.asList(
-                        RowFactory.create("1648811730857:10", "10", "1.0", "0.555555", "-1",
-                                "The zingg spark application", "two", "Yes. good application", "test"),
-                        RowFactory.create("1648811730857:20", "20", "1.0", "1.0", "-1",
-                                "It very popular data science", "Three", "true indeed", "test"),
-                        RowFactory.create("1648811730857:30", "30", "1.0", "0.999995", "-1",
-                                "It written java scala", "four", "", "test"),
-                        RowFactory.create("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
-                                "thank", "test")),
-                schemaActual);
-
-        System.out.println("testOriginalDataAfterPostprocessLinked original :");
-        original.show(200);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData5Original(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData5Actual(), SchemaActual.class);
 
-        Dataset<Row> newDataset = ((SparkFrame)(zsCTX.getDSUtil().postprocessLinked(new SparkFrame(actual), new SparkFrame(original)))).df();
+        ZFrame<D, R, C> newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal);
 
-        System.out.println("testOriginalDataAfterPostprocessLinked newDataset :");
-        newDataset.show(200);
-
-        assertTrue(newDataset.select("field1", "field2", "field3").except(original.select("field1", "field2", "field3")).isEmpty());
-        assertTrue(original.select("field1", "field2", "field3").except(newDataset.select("field1", "field2", "field3")).isEmpty());
+        assertTrue(newZFrame.select("field1", "field2", "field3").except(zFrameOriginal.select("field1", "field2", "field3")).isEmpty());
+        assertTrue(zFrameOriginal.select("field1", "field2", "field3").except(newZFrame.select("field1", "field2", "field3")).isEmpty());
     }
 
+
 }
\ No newline at end of file
diff --git a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java b/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java
new file mode 100644
index 000000000..fe839e593
--- /dev/null
+++ b/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java
@@ -0,0 +1,63 @@
+package zingg.common.core.util;
+
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import zingg.common.client.Arguments;
+import zingg.common.client.FieldDefinition;
+import zingg.common.client.IArguments;
+import zingg.common.client.MatchType;
+import zingg.common.client.ZinggClientException;
+import zingg.common.core.preprocess.StopWordsRemover;
+import zingg.spark.core.context.ZinggSparkContext;
+import zingg.spark.core.preprocess.SparkStopWordsRemover;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class SampleStopWordRemover {
+
+    public static List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, DataType>> getStopWordRemovers(ZinggSparkContext zsCTX,
+                                                                                                                IArguments args) throws ZinggClientException {
+
+        List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, DataType>> sparkStopWordsRemovers = new ArrayList<>();
+
+        //add first stopWordRemover
+        List<FieldDefinition> fdList = new ArrayList<FieldDefinition>(4);
+        ArrayList<MatchType> matchTypelistFuzzy = new ArrayList<MatchType>();
+        matchTypelistFuzzy.add(MatchType.FUZZY);
+        FieldDefinition eventFD = new FieldDefinition();
+        eventFD.setDataType("string");
+        eventFD.setFieldName("statement");
+        eventFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(eventFD);
+        IArguments stmtArgs = new Arguments();
+        stmtArgs.setFieldDefinition(fdList);
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX,stmtArgs));
+
+        //add second stopWordRemover
+        String stopWordsFileName1 = Objects.requireNonNull(
+                SampleStopWordRemover.class.getResource("../../../../preProcess/stopWords.csv")).getFile();
+        FieldDefinition fieldDefinition1 = new FieldDefinition();
+        fieldDefinition1.setStopWords(stopWordsFileName1);
+        fieldDefinition1.setFieldName("field1");
+        List<FieldDefinition> fieldDefinitionList1 = List.of(fieldDefinition1);
+        args.setFieldDefinition(fieldDefinitionList1);
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args));
+
+        //add third stopWordRemover
+        String stopWordsFileName2 = Objects.requireNonNull(
+                SampleStopWordRemover.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile();
+        FieldDefinition fieldDefinition2 = new FieldDefinition();
+        fieldDefinition2.setStopWords(stopWordsFileName2);
+        fieldDefinition2.setFieldName("field1");
+        List<FieldDefinition> fieldDefinitionList2 = List.of(fieldDefinition2);
+        args.setFieldDefinition(fieldDefinitionList2);
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args));
+
+        return sparkStopWordsRemovers;
+    }
+}
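A note on the assertion pattern the patch above settles on. Assuming ZFrame.except mirrors Spark's Dataset.except (set difference), one empty check only proves containment, so every test asserts the pair of directions. A minimal sketch of that idea as a hypothetical helper (not part of the patch set):

import static org.junit.jupiter.api.Assertions.assertTrue;

import zingg.common.client.ZFrame;

public class ZFrameAssert {

    // Frame equality as two containment checks:
    // expected.except(actual) empty  => every expected row is present in actual
    // actual.except(expected) empty  => actual carries nothing extra
    public static <D, R, C> void assertFrameEquals(ZFrame<D, R, C> expected, ZFrame<D, R, C> actual) {
        assertTrue(expected.except(actual).isEmpty());
        assertTrue(actual.except(expected).isEmpty());
    }
}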
From a6fa464b87580a8f0e0b50d9fe8970b064b81096 Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:17:55 +0530
Subject: [PATCH 166/219] renamed TestStopWords to TestStopWordsBase

---
 ...Words.java => TestSparkStopWordsBase.java} | 11 ++++----
 ...tStopWords.java => TestStopWordsBase.java} | 27 +++----------------
 2 files changed, 9 insertions(+), 29 deletions(-)
 rename spark/core/src/test/java/zingg/common/core/preprocess/{TestSparkStopWords.java => TestSparkStopWordsBase.java} (86%)
 rename spark/core/src/test/java/zingg/common/core/preprocess/{TestStopWords.java => TestStopWordsBase.java} (80%)

diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java
similarity index 86%
rename from spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
rename to spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java
index 5d925cd33..a2efce588 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java
@@ -10,7 +10,7 @@
 import org.apache.spark.sql.types.DataType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
-import zingg.common.client.IArguments;
+import zingg.common.client.Arguments;
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.WithSession;
 import zingg.common.core.util.SampleStopWordRemover;
@@ -18,10 +18,9 @@
 import zingg.spark.client.util.SparkDFObjectUtil;
 import zingg.spark.client.util.WithSparkSession;
 import zingg.spark.core.context.ZinggSparkContext;
 
-public class TestSparkStopWords extends TestStopWords<SparkSession, Dataset<Row>, Row, Column, DataType> {
+public class TestSparkStopWordsBase extends TestStopWordsBase<SparkSession, Dataset<Row>, Row, Column, DataType> {
 
-    public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class);
-    public static IArguments args;
+    public static final Log LOG = LogFactory.getLog(TestSparkStopWordsBase.class);
     public static JavaSparkContext ctx;
     public static SparkSession spark;
     public static ZinggSparkContext zsCTX;
@@ -32,8 +31,8 @@ public static void setup() {
         setUpSpark();
     }
 
-    public TestSparkStopWords() throws ZinggClientException {
-        super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, args), zsCTX);
+    public TestSparkStopWordsBase() throws ZinggClientException {
+        super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, new Arguments()), zsCTX);
     }
 
     protected static void setUpSpark() {
diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
similarity index 80%
rename from spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
rename to spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
index ac8255cdc..0a56b778d 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
@@ -2,51 +2,32 @@
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Test;
 
-import zingg.common.client.Arguments;
-import zingg.common.client.ArgumentsUtil;
-import zingg.common.client.FieldDefinition;
-import zingg.common.client.IArguments;
-import zingg.common.client.MatchType;
 import zingg.common.client.ZFrame;
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
 import zingg.common.core.context.Context;
 import zingg.common.core.data.Constant;
-import zingg.common.core.model.Event;
 import zingg.common.core.model.Schema;
 import zingg.common.core.model.SchemaActual;
 import zingg.common.core.model.SchemaOriginal;
-import zingg.spark.client.SparkFrame;
-import zingg.spark.core.context.ZinggSparkContext;
-import zingg.spark.core.executor.ZinggSparkTester;
-import zingg.spark.core.preprocess.SparkStopWordsRemover;
 
-public abstract class TestStopWords<S, D, R, C, T> {
+public abstract class TestStopWordsBase<S, D, R, C, T> {
 
-    public static final Log LOG = LogFactory.getLog(TestStopWords.class);
+    public static final Log LOG = LogFactory.getLog(TestStopWordsBase.class);
     private final DFObjectUtil<S, D, R, C> dfObjectUtil;
     private final List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers;
     private final Context<S, D, R, C, T> context;
 
-    public TestStopWords(DFObjectUtil<S, D, R, C> dfObjectUtil, List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers,
-                         Context<S, D, R, C, T> context) {
+    public TestStopWordsBase(DFObjectUtil<S, D, R, C> dfObjectUtil, List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers,
+                             Context<S, D, R, C, T> context) {
         this.dfObjectUtil = dfObjectUtil;
         this.stopWordsRemovers = stopWordsRemovers;

From f6d08bd4b7d56c90919db885107d4405212cb31d Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:37:11 +0530
Subject: [PATCH 167/219] added interface StopWordRemoverUtility

---
 .../core/preprocess/TestStopWordsBase.java    |  9 +++++---
 .../core/util/StopWordRemoverUtility.java     | 13 +++++++++++
 ...WordsBase.java => TestSparkStopWords.java} | 11 +++++-----
 ....java => SparkStopWordRemoverUtility.java} | 22 +++++++++----------
 4 files changed, 35 insertions(+), 20 deletions(-)
 rename {spark => common}/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java (93%)
 create mode 100644 common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
 rename spark/core/src/test/java/zingg/common/core/preprocess/{TestSparkStopWordsBase.java => TestSparkStopWords.java} (82%)
 rename spark/core/src/test/java/zingg/common/core/util/{SampleStopWordRemover.java => SparkStopWordRemoverUtility.java} (67%)

diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
similarity index 93%
rename from spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
rename to common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
index 0a56b778d..ca14ef979 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
+++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
@@ -9,6 +9,7 @@
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Test;
 
+import zingg.common.client.Arguments;
 import zingg.common.client.ZFrame;
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.ColName;
@@ -18,6 +19,7 @@ import zingg.common.core.model.Schema;
 import zingg.common.core.model.SchemaActual;
 import zingg.common.core.model.SchemaOriginal;
+import zingg.common.core.util.StopWordRemoverUtility;
 
 public abstract class TestStopWordsBase<S, D, R, C, T> {
 
@@ -26,10 +28,11 @@ public abstract class TestStopWordsBase<S, D, R, C, T> {
     private final List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers;
     private final Context<S, D, R, C, T> context;
 
-    public TestStopWordsBase(DFObjectUtil<S, D, R, C> dfObjectUtil, List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers,
-                             Context<S, D, R, C, T> context) {
+
+    public TestStopWordsBase(DFObjectUtil<S, D, R, C> dfObjectUtil, StopWordRemoverUtility<S, D, R, C, T> stopWordRemoverUtility,
+                             Context<S, D, R, C, T> context) throws ZinggClientException {
         this.dfObjectUtil = dfObjectUtil;
-        this.stopWordsRemovers = stopWordsRemovers;
+        this.stopWordsRemovers = stopWordRemoverUtility.getStopWordRemovers(context, new Arguments());
         this.context = context;
     }
diff --git a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
new file mode 100644
index 000000000..04871a606
--- /dev/null
+++ b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
@@ -0,0 +1,13 @@
+package zingg.common.core.util;
+
+import zingg.common.client.IArguments;
+import zingg.common.client.ZinggClientException;
+import zingg.common.core.context.Context;
+import zingg.common.core.preprocess.StopWordsRemover;
+
+import java.util.List;
+
+public interface StopWordRemoverUtility<S, D, R, C, T> {
+
+    List<StopWordsRemover<S, D, R, C, T>> getStopWordRemovers(Context<S, D, R, C, T> context, IArguments arguments) throws ZinggClientException;
+}
diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
similarity index 82%
rename from spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java
rename to spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
index a2efce588..0ba570d03 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
@@ -10,17 +10,16 @@
 import org.apache.spark.sql.types.DataType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 
-import zingg.common.client.Arguments;
 import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.WithSession;
-import zingg.common.core.util.SampleStopWordRemover;
+import zingg.common.core.util.SparkStopWordRemoverUtility;
 import zingg.spark.client.util.SparkDFObjectUtil;
 import zingg.spark.client.util.WithSparkSession;
 import zingg.spark.core.context.ZinggSparkContext;
 
-public class TestSparkStopWordsBase extends TestStopWordsBase<SparkSession, Dataset<Row>, Row, Column, DataType> {
+public class TestSparkStopWords extends TestStopWordsBase<SparkSession, Dataset<Row>, Row, Column, DataType> {
 
-    public static final Log LOG = LogFactory.getLog(TestSparkStopWordsBase.class);
+    public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class);
     public static JavaSparkContext ctx;
     public static SparkSession spark;
     public static ZinggSparkContext zsCTX;
@@ -31,8 +30,8 @@ public static void setup() {
         setUpSpark();
     }
 
-    public TestSparkStopWordsBase() throws ZinggClientException {
-        super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, new Arguments()), zsCTX);
+    public TestSparkStopWords() throws ZinggClientException {
+        super(new SparkDFObjectUtil(withSession), new SparkStopWordRemoverUtility(), zsCTX);
     }
 
     protected static void setUpSpark() {
diff --git a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java
similarity index 67%
rename from spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java
rename to spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java
index fe839e593..a3bb2a52c 100644
--- a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java
+++ b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java
@@ -10,18 +10,18 @@ import zingg.common.client.IArguments;
 import zingg.common.client.MatchType;
 import zingg.common.client.ZinggClientException;
+import zingg.common.core.context.Context;
 import zingg.common.core.preprocess.StopWordsRemover;
-import zingg.spark.core.context.ZinggSparkContext;
 import zingg.spark.core.preprocess.SparkStopWordsRemover;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 
-public class SampleStopWordRemover {
+public class SparkStopWordRemoverUtility implements StopWordRemoverUtility<SparkSession, Dataset<Row>, Row, Column, DataType> {
 
-    public static List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, DataType>> getStopWordRemovers(ZinggSparkContext zsCTX,
-                                                                                                                IArguments args) throws ZinggClientException {
+    @Override
+    public List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, DataType>> getStopWordRemovers(Context<SparkSession, Dataset<Row>, Row, Column, DataType> context, IArguments arguments) throws ZinggClientException {
 
         List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, DataType>> sparkStopWordsRemovers = new ArrayList<>();
 
@@ -36,27 +36,27 @@ public static List<StopWordsRemover<SparkSession, Dataset<Row>, Row, Column, Dat
         fdList.add(eventFD);
         IArguments stmtArgs = new Arguments();
         stmtArgs.setFieldDefinition(fdList);
-        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX,stmtArgs));
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(context,stmtArgs));
 
         //add second stopWordRemover
         String stopWordsFileName1 = Objects.requireNonNull(
-                SampleStopWordRemover.class.getResource("../../../../preProcess/stopWords.csv")).getFile();
+                StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile();
         FieldDefinition fieldDefinition1 = new FieldDefinition();
         fieldDefinition1.setStopWords(stopWordsFileName1);
         fieldDefinition1.setFieldName("field1");
         List<FieldDefinition> fieldDefinitionList1 = List.of(fieldDefinition1);
-        args.setFieldDefinition(fieldDefinitionList1);
-        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args));
+        arguments.setFieldDefinition(fieldDefinitionList1);
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments));
 
         //add third stopWordRemover
         String stopWordsFileName2 = Objects.requireNonNull(
-                SampleStopWordRemover.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile();
+                StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile();
         FieldDefinition fieldDefinition2 = new FieldDefinition();
         fieldDefinition2.setStopWords(stopWordsFileName2);
         fieldDefinition2.setFieldName("field1");
         List<FieldDefinition> fieldDefinitionList2 = List.of(fieldDefinition2);
-        args.setFieldDefinition(fieldDefinitionList2);
-        sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args));
+        arguments.setFieldDefinition(fieldDefinitionList2);
+        sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments));
 
         return sparkStopWordsRemovers;
     }
From e73bc3fdb9c494fa7951b4e34048a31ccaca83b9 Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:46:46 +0530
Subject: [PATCH 168/219] renamed files

---
 .../common/core/block/TestBlockBase.java      |   6 +-
 .../data/{Constant.java => TestData.java}     | 128 +++++++++---------
 ...maActual.java => PostStopWordProcess.java} |   6 +-
 ...riginal.java => PriorStopWordProcess.java} |   4 +-
 .../model/{Schema.java => Statement.java}     |   4 +-
 .../core/preprocess/TestStopWordsBase.java    |  28 ++--
 6 files changed, 88 insertions(+), 88 deletions(-)
 rename common/core/src/test/java/zingg/common/core/data/{Constant.java => TestData.java} (67%)
 rename common/core/src/test/java/zingg/common/core/model/{SchemaActual.java => PostStopWordProcess.java} (72%)
 rename common/core/src/test/java/zingg/common/core/model/{SchemaOriginal.java => PriorStopWordProcess.java} (70%)
 rename common/core/src/test/java/zingg/common/core/model/{Schema.java => Statement.java} (63%)

diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
index a8f746c47..3d4f440af 100644
--- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
+++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java
@@ -18,7 +18,7 @@ import zingg.common.core.util.HashUtil;
 import zingg.common.core.model.Event;
 import zingg.common.core.model.EventCluster;
-import zingg.common.core.data.Constant;
+import zingg.common.core.data.TestData;
 
@@ -37,8 +37,8 @@ public TestBlockBase(DFObjectUtil<S, D, R, C> dfObjectUtil, HashUtil<S, D, R, C,
-        ZFrame<D, R, C> zFrameEvent = dfObjectUtil.getDFFromObjectList(Constant.createSampleEventData(), Event.class);
-        ZFrame<D, R, C> zFrameEventCluster = dfObjectUtil.getDFFromObjectList(Constant.createSampleClusterEventData(), EventCluster.class);
+        ZFrame<D, R, C> zFrameEvent = dfObjectUtil.getDFFromObjectList(TestData.createSampleEventData(), Event.class);
+        ZFrame<D, R, C> zFrameEventCluster = dfObjectUtil.getDFFromObjectList(TestData.createSampleClusterEventData(), EventCluster.class);
         IArguments args = getArguments();
 
         Tree<Canopy<R>> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1,
diff --git a/common/core/src/test/java/zingg/common/core/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/TestData.java
similarity index 67%
rename from common/core/src/test/java/zingg/common/core/data/Constant.java
rename to common/core/src/test/java/zingg/common/core/data/TestData.java
index 0418f9810..7418ac21c 100644
--- a/common/core/src/test/java/zingg/common/core/data/Constant.java
+++ b/common/core/src/test/java/zingg/common/core/data/TestData.java
@@ -2,14 +2,14 @@
 
 import zingg.common.core.model.Event;
 import zingg.common.core.model.EventCluster;
-import zingg.common.core.model.Schema;
-import zingg.common.core.model.SchemaActual;
-import zingg.common.core.model.SchemaOriginal;
+import zingg.common.core.model.Statement;
+import zingg.common.core.model.PostStopWordProcess;
+import zingg.common.core.model.PriorStopWordProcess;
 
 import java.util.ArrayList;
 import java.util.List;
 
-public class Constant {
+public class TestData {
     public static List<Event> createSampleEventData() {
 
         int row_id = 1;
@@ -136,127 +136,127 @@ public static List<EventCluster> createSampleClusterEventData() {
         return sample;
     }
 
-    public static List<Schema> getData1Original() {
+    public static List<Statement> getData1Original() {
 
-        List<Schema> sample = new ArrayList<>();
-        sample.add(new Schema("The zingg is a Spark application"));
-        sample.add(new Schema("It is very popular in data Science"));
-        sample.add(new Schema("It is written in Java and Scala"));
-        sample.add(new Schema("Best of luck to zingg"));
+        List<Statement> sample = new ArrayList<>();
+        sample.add(new Statement("The zingg is a Spark application"));
+        sample.add(new Statement("It is very popular in data Science"));
+        sample.add(new Statement("It is written in Java and Scala"));
+        sample.add(new Statement("Best of luck to zingg"));
 
         return sample;
     }
 
-    public static List<Schema> getData1Expected() {
+    public static List<Statement> getData1Expected() {
 
-        List<Schema> sample = new ArrayList<>();
-        sample.add(new Schema("zingg spark application"));
-        sample.add(new Schema("very popular in data science"));
-        sample.add(new Schema("written in java and scala"));
-        sample.add(new Schema("best luck to zingg"));
+        List<Statement> sample = new ArrayList<>();
+        sample.add(new Statement("zingg spark application"));
+        sample.add(new Statement("very popular in data science"));
+        sample.add(new Statement("written in java and scala"));
+        sample.add(new Statement("best luck to zingg"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData2Original() {
+    public static List<PriorStopWordProcess> getData2Original() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two",
                 "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed",
+        sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed",
                 "test"));
-        sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test"));
+        sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData2Expected() {
+    public static List<PriorStopWordProcess> getData2Expected() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test"));
-        sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test"));
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. a good application", "test"));
+        sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test"));
+        sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "best luck to zingg ", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData3Original() {
+    public static List<PriorStopWordProcess> getData3Original() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two",
                 "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed",
+        sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed",
                 "test"));
-        sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test"));
+        sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData3Expected() {
+    public static List<PriorStopWordProcess> getData3Expected() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test"));
-        sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test"));
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. a good application", "test"));
+        sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test"));
+        sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "best luck to zingg ", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData4original() {
+    public static List<PriorStopWordProcess> getData4original() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two",
                 "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed",
+        sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed",
                 "test"));
-        sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test"));
+        sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "Best of luck to zingg", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaActual> getData4Expected() {
+    public static List<PostStopWordProcess> getData4Expected() {
 
-        List<SchemaActual> sample = new ArrayList<>();
-        sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1",
+        List<PostStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1",
                 "The zingg spark application", "two", "Yes. good application", "test"));
-        sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1",
+        sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1",
                 "It very popular data science", "Three", "true indeed", "test"));
-        sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1",
+        sample.add(new PostStopWordProcess("1648811730857:30", "30", "1.0", "0.999995", "-1",
                 "It written java scala", "four", "", "test"));
-        sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
+        sample.add(new PostStopWordProcess("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
                 "thank", "test"));
 
         return sample;
     }
 
-    public static List<SchemaOriginal> getData5Original() {
+    public static List<PriorStopWordProcess> getData5Original() {
 
-        List<SchemaOriginal> sample = new ArrayList<>();
-        sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two",
+        List<PriorStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two",
                 "Yes. a good application", "test"));
-        sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed",
+        sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed",
                 "test"));
-        sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test"));
-        sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test"));
+        sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test"));
+        sample.add(new PriorStopWordProcess("40", "Best of luck to zingg", "Five", "thank you", "test"));
 
         return sample;
     }
 
-    public static List<SchemaActual> getData5Actual() {
+    public static List<PostStopWordProcess> getData5Actual() {
 
-        List<SchemaActual> sample = new ArrayList<>();
-        sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1",
+        List<PostStopWordProcess> sample = new ArrayList<>();
+        sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1",
                 "The zingg spark application", "two", "Yes. good application", "test"));
-        sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1",
+        sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1",
                 "It very popular data science", "Three", "true indeed", "test"));
-        sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1",
+        sample.add(new PostStopWordProcess("1648811730857:30", "30", "1.0", "0.999995", "-1",
                 "It written java scala", "four", "", "test"));
-        sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
+        sample.add(new PostStopWordProcess("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five",
                 "thank", "test"));
 
         return sample;
diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java b/common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java
similarity index 72%
rename from common/core/src/test/java/zingg/common/core/model/SchemaActual.java
rename to common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java
index 762420435..c137fa559 100644
--- a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java
+++ b/common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java
@@ -1,6 +1,6 @@
 package zingg.common.core.model;
 
-public class SchemaActual {
+public class PostStopWordProcess {
     public final String z_cluster;
     public final String z_zid;
     public final String z_prediction;
@@ -11,8 +11,8 @@ public class SchemaActual {
     public final String field3;
     public final String z_zsource;
 
-    public SchemaActual(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch,
-                        String field1, String field2, String field3, String z_zsource) {
+    public PostStopWordProcess(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch,
+                               String field1, String field2, String field3, String z_zsource) {
         this.z_cluster = z_cluster;
         this.z_zid = z_zid;
         this.z_prediction = z_prediction;
diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java b/common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java
similarity index 70%
rename from common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java
rename to common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java
index 25d55eca2..502a8ef2b 100644
--- a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java
+++ b/common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java
@@ -1,13 +1,13 @@
 package zingg.common.core.model;
 
-public class SchemaOriginal {
+public class PriorStopWordProcess {
     public final String z_zid;
     public final String field1;
     public final String field2;
     public final String field3;
     public final String z_zsource;
 
-    public SchemaOriginal(String z_zid, String field1, String field2, String field3, String z_zsource) {
+    public PriorStopWordProcess(String z_zid, String field1, String field2, String field3, String z_zsource) {
         this.z_zid = z_zid;
         this.field1 = field1;
         this.field2 = field2;
diff --git a/common/core/src/test/java/zingg/common/core/model/Schema.java b/common/core/src/test/java/zingg/common/core/model/Statement.java
similarity index 63%
rename from common/core/src/test/java/zingg/common/core/model/Schema.java
rename to common/core/src/test/java/zingg/common/core/model/Statement.java
index c608bd370..1fabf51ef 100644
--- a/common/core/src/test/java/zingg/common/core/model/Schema.java
+++ b/common/core/src/test/java/zingg/common/core/model/Statement.java
@@ -1,9 +1,9 @@
 package zingg.common.core.model;
 
-public class Schema {
+public class Statement {
     public final String statement;
 
-    public Schema(String statement) {
+    public Statement(String statement) {
         this.statement = statement;
     }
 }
diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
index ca14ef979..51ff098b9 100644
--- a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
+++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
@@ -15,10 +15,10 @@
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
 import zingg.common.core.context.Context;
-import zingg.common.core.data.Constant;
-import zingg.common.core.model.Schema;
-import zingg.common.core.model.SchemaActual;
-import zingg.common.core.model.SchemaOriginal;
+import zingg.common.core.data.TestData;
+import zingg.common.core.model.Statement;
+import zingg.common.core.model.PostStopWordProcess;
+import zingg.common.core.model.PriorStopWordProcess;
 import zingg.common.core.util.StopWordRemoverUtility;
 
 public abstract class TestStopWordsBase<S, D, R, C, T> {
@@ -42,8 +42,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception {
 
         String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase();
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData1Original(), Statement.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData1Expected(), Statement.class);
 
         StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(0);
 
@@ -57,8 +57,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception {
     @Test
     public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception {
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData2Original(), PriorStopWordProcess.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData2Expected(), PriorStopWordProcess.class);
 
         StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(1);
         ZFrame<D, R, C> newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
@@ -70,8 +70,8 @@ public void testRemoveStopWordsFromDataset() throws ZinggClientException, Except
     @Test
     public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception {
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData3Original(), PriorStopWordProcess.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class);
 
         StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(2);
         ZFrame<D, R, C> newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
@@ -84,8 +84,8 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept
     @Test
     public void testForOriginalDataAfterPostProcess() throws Exception {
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData4original(), SchemaOriginal.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData4Expected(), SchemaActual.class);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData4original(), PriorStopWordProcess.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData4Expected(), PostStopWordProcess.class);
 
         ZFrame<D, R, C> newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal);
 
@@ -96,8 +96,8 @@ public void testForOriginalDataAfterPostProcess() throws Exception {
     @Test
     public void testOriginalDataAfterPostProcessLinked() throws Exception {
 
-        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData5Original(), SchemaOriginal.class);
-        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData5Actual(), SchemaActual.class);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData5Original(), PriorStopWordProcess.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData5Actual(), PostStopWordProcess.class);
 
         ZFrame<D, R, C> newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal);

From 91d92ffb038439203fbc67f33f52bb474e8d188c Mon Sep 17 00:00:00 2001
From: administrator
Date: Wed, 10 Jul 2024 12:30:15 +0530
Subject: [PATCH 169/219] renamed newDataSet to newZFrame

---
 .../zingg/common/core/preprocess/TestStopWordsBase.java    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
index 51ff098b9..b8210db03 100644
--- a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
+++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
@@ -74,10 +74,10 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept
         ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class);
 
         StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(2);
-        ZFrame<D, R, C> newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
+        ZFrame<D, R, C> newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
 
-        assertTrue(zFrameExpected.except(newDataSet).isEmpty());
-        assertTrue(newDataSet.except(zFrameExpected).isEmpty());
+        assertTrue(zFrameExpected.except(newZFrame).isEmpty());
+        assertTrue(newZFrame.except(zFrameExpected).isEmpty());
 
     }
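The next patch applies the same generalization to the blocking test. A minimal sketch of the assertion it carries, using only calls that appear in the TestBlockBase diff below; the point is that the root Canopy of the learned blocking tree holds the hash function that best splits the labelled sample, so pinning its name ("identityInteger", the identity hash on the integer year column) guards tree construction against regressions:

// Fragment sketch inside testTree(); zFrameEvent, zFrameEventCluster, args,
// blockingTreeUtil and hashUtil are the fields and locals shown in the diff below.
Tree<Canopy<R>> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(
        zFrameEvent, zFrameEventCluster, 0.5, -1, args, hashUtil.getHashFunctionList());
Canopy<R> root = blockingTree.getHead();        // most discriminating split first
assertEquals("identityInteger", root.getFunction().getName());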
From 34934705f25c51b34f51d5c62f0bfdf4d18d97ec Mon Sep 17 00:00:00 2001
From: administrator
Date: Mon, 8 Jul 2024 19:45:01 +0530
Subject: [PATCH 170/219] generalized block test

---
 .../test/java/zingg/block/TestBlockBase.java  | 101 +++++++
 .../src/test/java/zingg/data/Constant.java    | 135 +++++++++
 .../core/src/test/java/zingg/model/Event.java |  15 +
 .../test/java/zingg/model/EventCluster.java   |  23 ++
 .../src/test/java/zingg/block/TestBlock.java  | 256 ------------------
 .../test/java/zingg/block/TestSparkBlock.java |  69 +++++
 6 files changed, 343 insertions(+), 256 deletions(-)
 create mode 100644 common/core/src/test/java/zingg/block/TestBlockBase.java
 create mode 100644 common/core/src/test/java/zingg/data/Constant.java
 create mode 100644 common/core/src/test/java/zingg/model/Event.java
 create mode 100644 common/core/src/test/java/zingg/model/EventCluster.java
 delete mode 100644 spark/core/src/test/java/zingg/block/TestBlock.java
 create mode 100644 spark/core/src/test/java/zingg/block/TestSparkBlock.java

diff --git a/common/core/src/test/java/zingg/block/TestBlockBase.java b/common/core/src/test/java/zingg/block/TestBlockBase.java
new file mode 100644
index 000000000..9f1593790
--- /dev/null
+++ b/common/core/src/test/java/zingg/block/TestBlockBase.java
@@ -0,0 +1,101 @@
+package zingg.block;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import zingg.common.client.ArgumentsUtil;
+import zingg.common.client.FieldDefinition;
+import zingg.common.client.IArguments;
+import zingg.common.client.MatchType;
+import zingg.common.client.ZFrame;
+import zingg.common.client.ZinggClientException;
+import zingg.common.client.util.DFObjectUtil;
+import zingg.common.core.block.Canopy;
+import zingg.common.core.block.Tree;
+import zingg.common.core.util.BlockingTreeUtil;
+import zingg.common.core.util.HashUtil;
+import zingg.model.Event;
+import zingg.model.EventCluster;
+import zingg.data.Constant;
+
+public abstract class TestBlockBase<S, D, R, C, T> {
+
+    public ArgumentsUtil argumentsUtil = new ArgumentsUtil();
+    public final DFObjectUtil<S, D, R, C> dfObjectUtil;
+    public final HashUtil<S, D, R, C, T> hashUtil;
+    public final BlockingTreeUtil<S, D, R, C, T> blockingTreeUtil;
+
+    public TestBlockBase(DFObjectUtil<S, D, R, C> dfObjectUtil, HashUtil<S, D, R, C, T> hashUtil, BlockingTreeUtil<S, D, R, C, T> blockingTreeUtil) {
+        this.dfObjectUtil = dfObjectUtil;
+        this.hashUtil = hashUtil;
+        this.blockingTreeUtil = blockingTreeUtil;
+    }
+
+    @Test
+    public void testTree() throws Throwable {
+
+        // form tree
+        ZFrame<D, R, C> zFrameEvent = dfObjectUtil.getDFFromObjectList(Constant.createSampleEventData(), Event.class);
+        ZFrame<D, R, C> zFrameEventCluster = dfObjectUtil.getDFFromObjectList(Constant.createSampleClusterEventData(), EventCluster.class);
+        IArguments args = getArguments();
+
+        Tree<Canopy<R>> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1,
+                args, hashUtil.getHashFunctionList());
+
+        // primary deciding is unique year so identityInteger should have been picked
+        Canopy<R> head = blockingTree.getHead();
+        assertEquals("identityInteger", head.getFunction().getName());
+        blockingTree.toString();
+    }
+
+    private IArguments getArguments() throws ZinggClientException {
+        String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile();
+
+        IArguments args = argumentsUtil.createArgumentsFromJSON(configFilePath, "trainMatch");
+
+        List<FieldDefinition> fdList = getFieldDefList();
+
+        args.setFieldDefinition(fdList);
+        return args;
+    }
+
+    private List<FieldDefinition> getFieldDefList() {
+        List<FieldDefinition> fdList = new ArrayList<FieldDefinition>(4);
+
+        FieldDefinition idFD = new FieldDefinition();
+        idFD.setDataType("integer");
+        idFD.setFieldName("id");
+        ArrayList<MatchType> matchTypelistId = new ArrayList<MatchType>();
+        matchTypelistId.add(MatchType.DONT_USE);
+        idFD.setMatchType(matchTypelistId);
+        fdList.add(idFD);
+
+        ArrayList<MatchType> matchTypelistFuzzy = new ArrayList<MatchType>();
+        matchTypelistFuzzy.add(MatchType.FUZZY);
+
+
+        FieldDefinition yearFD = new FieldDefinition();
+        yearFD.setDataType("integer");
+        yearFD.setFieldName("year");
+        yearFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(yearFD);
+
+        FieldDefinition eventFD = new FieldDefinition();
+        eventFD.setDataType("string");
+        eventFD.setFieldName("event");
+        eventFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(eventFD);
+
+        FieldDefinition commentFD = new FieldDefinition();
+        commentFD.setDataType("string");
+        commentFD.setFieldName("comment");
+        commentFD.setMatchType(matchTypelistFuzzy);
+        fdList.add(commentFD);
+        return fdList;
+    }
+
+}
diff --git a/common/core/src/test/java/zingg/data/Constant.java b/common/core/src/test/java/zingg/data/Constant.java
new file mode 100644
index 000000000..8d8af0e40
--- /dev/null
+++ b/common/core/src/test/java/zingg/data/Constant.java
@@ -0,0 +1,135 @@
+package zingg.data;
+
+import zingg.model.Event;
+import zingg.model.EventCluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class Constant {
+    public static List<Event> createSampleEventData() {
+
+        int row_id = 1;
+        List<Event> sample = new ArrayList<>();
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disob", "India"));
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JW", "Amritsar"));
+        sample.add(new Event(row_id++, 1930, "Civil Dis", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add((new Event(row_id++, 1942, "quit N", "Mahatma")));
+        sample.add((new Event(row_id++, 1919, "JallianWal", "Punjb")));
+        sample.add(new Event(row_id++, 1942, "quit ", "Mahatm"));
+        sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama"));
+        sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disob", "India"));
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India"));
+        sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JW", "Amritsar"));
+        sample.add(new Event(row_id++, 1930, "Civil Dis", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit N", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit ", "Mahatm"));
+        sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama"));
+        sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disob", "India"));
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India"));
+        sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JW", "Amritsar"));
+        sample.add(new Event(row_id++, 1930, "Civil Dis", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit N", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit ", "Mahatm"));
+        sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama"));
+        sample.add(new Event(row_id++, 1942, "quit Natin", "Mahaatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disob", "India"));
+        sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidience", "India"));
+        sample.add(new Event(row_id++, 1942, "Quit Bharat", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab"));
+        sample.add(new Event(row_id++, 1930, "Civil Disobidence", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Hindustan", "Mahatma Gandhi"));
+        sample.add(new Event(row_id++, 1919, "JW", "Amritsar"));
+        sample.add(new Event(row_id++, 1930, "Civil Dis", "India"));
+        sample.add(new Event(row_id++, 1942, "quit Nation", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit N", "Mahatma"));
+        sample.add(new Event(row_id++, 1919, "JallianWal", "Punjb"));
+        sample.add(new Event(row_id++, 1942, "quit ", "Mahatm"));
+        sample.add(new Event(row_id++, 1942, "quit Ntn", "Mahama"));
+        sample.add(new Event(row_id, 1942, "quit Natin", "Mahaatma"));
+
+        return sample;
+    }
+
+    public static List<EventCluster> createSampleClusterEventData() {
+
+        int row_id = 1;
+        List<EventCluster> sample = new ArrayList<>();
+        sample.add(new EventCluster(row_id++, 1942, "quit Nation", "Mahatma",1942, "quit Nation", "Mahatma", 1L));
+        sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+        sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma",1942, "quit N", "Mahatma", 1L));
+        sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+        sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+        sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+        sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L));
+        sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L));
+        sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L));
+        sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L));
+        sample.add(new EventCluster(row_id++, 1919,
"JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventCluster(row_id, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + + return sample; + } +} diff --git a/common/core/src/test/java/zingg/model/Event.java b/common/core/src/test/java/zingg/model/Event.java new file mode 100644 index 000000000..c7feb391d --- /dev/null +++ b/common/core/src/test/java/zingg/model/Event.java @@ -0,0 +1,15 @@ +package zingg.model; + +public class Event { + public final Integer id; + public final Integer year; + public final String event; + public final String comment; + + public Event(Integer id, Integer year, String event, String comment) { + this.id = id; + this.year = year; + this.event = event; + this.comment = comment; + } +} diff --git a/common/core/src/test/java/zingg/model/EventCluster.java b/common/core/src/test/java/zingg/model/EventCluster.java new file mode 100644 index 000000000..7cca1a082 --- /dev/null +++ b/common/core/src/test/java/zingg/model/EventCluster.java @@ -0,0 +1,23 @@ +package zingg.model; + +public class EventCluster { + public final Integer id; + public final Integer year; + public final String event; + public final String 
comment;
+    public final Integer z_year;
+    public final String z_event;
+    public final String z_comment;
+    public final Long z_zid;
+
+    public EventCluster(Integer id, Integer year, String event, String comment, Integer z_year, String z_event, String z_comment, Long z_zid) {
+        this.id = id;
+        this.year = year;
+        this.event = event;
+        this.comment = comment;
+        this.z_year = z_year;
+        this.z_event = z_event;
+        this.z_comment = z_comment;
+        this.z_zid = z_zid;
+    }
+}
\ No newline at end of file
diff --git a/spark/core/src/test/java/zingg/block/TestBlock.java b/spark/core/src/test/java/zingg/block/TestBlock.java
deleted file mode 100644
index 17cbdb93a..000000000
--- a/spark/core/src/test/java/zingg/block/TestBlock.java
+++ /dev/null
@@ -1,256 +0,0 @@
-package zingg.block;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.sql.Column;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import org.junit.jupiter.api.Test;
-
-import zingg.common.client.ArgumentsUtil;
-import zingg.common.client.FieldDefinition;
-import zingg.common.client.IArguments;
-import zingg.common.client.MatchType;
-import zingg.common.client.ZFrame;
-import zingg.common.client.ZinggClientException;
-import zingg.common.core.block.Canopy;
-import zingg.common.core.block.Tree;
-import zingg.spark.client.SparkFrame;
-import zingg.spark.core.executor.ZinggSparkTester;
-import zingg.spark.core.util.SparkBlockingTreeUtil;
-import zingg.spark.core.util.SparkHashUtil;
-
-public class TestBlock extends ZinggSparkTester {
-
-    @Test
-    public void testTree() throws Throwable {
-
-        ZFrame<Dataset<Row>, Row, Column> testData = getTestData();
-
-        ZFrame<Dataset<Row>, Row, Column> posDf = getPosData();
-
-        IArguments args = getArguments();
-
-        // form tree
-        SparkBlockingTreeUtil blockingTreeUtil = new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil());
-        SparkHashUtil hashUtil = new SparkHashUtil(spark);
-
-        Tree<Canopy<Row>> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(testData, posDf, 0.5, -1,
-                args, hashUtil.getHashFunctionList());
-
-        // primary deciding is unique year so identityInteger should have been picked
-        Canopy<Row> head = blockingTree.getHead();
-        assertEquals("identityInteger", head.getFunction().getName());
-        blockingTree.toString();
-
-    }
-
-    StructType testDataSchema = new StructType(new StructField[] {
-            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
-            new StructField("year", DataTypes.IntegerType, false, Metadata.empty()),
-            new StructField("event", DataTypes.StringType, false, Metadata.empty()),
-            new StructField("comment", DataTypes.StringType, false, Metadata.empty())}
-    );
-
-    StructType schemaPos = new StructType(new StructField[] {
-            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
-            new StructField("year", DataTypes.IntegerType, false, Metadata.empty()),
-            new StructField("event", DataTypes.StringType, false, Metadata.empty()),
-            new StructField("comment", DataTypes.StringType, false, Metadata.empty()),
-            new StructField("z_year", DataTypes.IntegerType, false, Metadata.empty()),
-            new StructField("z_event", DataTypes.StringType, false, Metadata.empty()),
-            new StructField("z_comment", DataTypes.StringType, false,
Metadata.empty()), - new StructField("z_zid", DataTypes.StringType, false, Metadata.empty())} - ); - - - - - private IArguments getArguments() throws ZinggClientException { - String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile(); - - IArguments args = argsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); - - List fdList = getFieldDefList(); - - args.setFieldDefinition(fdList); - return args; - } - - private List getFieldDefList() { - List fdList = new ArrayList(4); - - FieldDefinition idFD = new FieldDefinition(); - idFD.setDataType("integer"); - idFD.setFieldName("id"); - ArrayList matchTypelistId = new ArrayList(); - matchTypelistId.add(MatchType.DONT_USE); - idFD.setMatchType(matchTypelistId); - fdList.add(idFD); - - ArrayList matchTypelistFuzzy = new ArrayList(); - matchTypelistFuzzy.add(MatchType.FUZZY); - - - FieldDefinition yearFD = new FieldDefinition(); - yearFD.setDataType("integer"); - yearFD.setFieldName("year"); - yearFD.setMatchType(matchTypelistFuzzy); - fdList.add(yearFD); - - FieldDefinition eventFD = new FieldDefinition(); - eventFD.setDataType("string"); - eventFD.setFieldName("event"); - eventFD.setMatchType(matchTypelistFuzzy); - fdList.add(eventFD); - - FieldDefinition commentFD = new FieldDefinition(); - commentFD.setDataType("string"); - commentFD.setFieldName("comment"); - commentFD.setMatchType(matchTypelistFuzzy); - fdList.add(commentFD); - return fdList; - } - - public SparkFrame getTestData() { - int row_id = 1; - // Create a DataFrame containing test data - Row[] data = { - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", 
"Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disob", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit India", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidience", "India"), - RowFactory.create(row_id++, new Integer(1942), "Quit Bharat", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JallianWala", "Punjab"), - RowFactory.create(row_id++, new Integer(1930), "Civil Disobidence", "India"), - RowFactory.create(row_id++, new Integer(1942), "quit Hindustan", "Mahatma Gandhi"), - RowFactory.create(row_id++, new Integer(1919), "JW", "Amritsar"), - RowFactory.create(row_id++, new Integer(1930), "Civil Dis", "India") , - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma") - 
}; - - return new SparkFrame( - spark.createDataFrame(Arrays.asList(data), - testDataSchema)); - - } - - private SparkFrame getPosData() { - int row_id = 1000; - // Create positive matching data - Row[] posData = { - RowFactory.create(row_id++, new Integer(1942), "quit Nation", "Mahatma",new Integer(1942), "quit Nation", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"), - RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new Integer(1942), "quit N", "Mahatma",new Integer(1942), "quit N", "Mahatma", "1"), - RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2"), - RowFactory.create(row_id++, new 
Integer(1942), "quit ", "Mahatm",new Integer(1942), "quit ", "Mahatm", "1"),
-                RowFactory.create(row_id++, new Integer(1942), "quit Ntn", "Mahama",new Integer(1942), "quit Ntn", "Mahama", "1"),
-                RowFactory.create(row_id++, new Integer(1942), "quit Natin", "Mahaatma",new Integer(1942), "quit Natin", "Mahaatma", "1"),
-                RowFactory.create(row_id++, new Integer(1919), "JallianWal", "Punjb",new Integer(1919), "JallianWal", "Punjb", "2")
-        };
-        return new SparkFrame(spark.createDataFrame(Arrays.asList(posData), schemaPos));
-    }
-
-
-}
diff --git a/spark/core/src/test/java/zingg/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/block/TestSparkBlock.java
new file mode 100644
index 000000000..108973ba3
--- /dev/null
+++ b/spark/core/src/test/java/zingg/block/TestSparkBlock.java
@@ -0,0 +1,69 @@
+package zingg.block;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import zingg.common.client.IArguments;
+import zingg.common.client.util.WithSession;
+import zingg.spark.client.util.SparkDFObjectUtil;
+import zingg.spark.client.util.WithSparkSession;
+import zingg.spark.core.context.ZinggSparkContext;
+import zingg.spark.core.util.SparkBlockingTreeUtil;
+import zingg.spark.core.util.SparkHashUtil;
+
+public class TestSparkBlock extends TestBlockBase<SparkSession, Dataset<Row>, Row, Column, DataType> {
+
+    public static final Log LOG = LogFactory.getLog(TestSparkBlock.class);
+    public static IArguments args;
+    public static JavaSparkContext ctx;
+    public static ZinggSparkContext zsCTX;
+    public static SparkSession spark;
+    public static WithSession<SparkSession> withSession;
+
+    public TestSparkBlock() {
+        super(new SparkDFObjectUtil(withSession), new SparkHashUtil(spark), new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil()));
+    }
+
+    @BeforeAll
+    public static void setup() {
+        setUpSpark();
+    }
+
+    protected static void setUpSpark() {
+        try {
+            spark = SparkSession
+                    .builder()
+                    .master("local[*]")
+                    .appName("Zingg" + "Junit")
+                    .getOrCreate();
+            ctx = new JavaSparkContext(spark.sparkContext());
+            withSession = new WithSparkSession();
+            withSession.setSession(spark);
+            zsCTX = new ZinggSparkContext();
+            zsCTX.init(spark);
+        } catch (Throwable e) {
+            if (LOG.isDebugEnabled())
+                e.printStackTrace();
+            LOG.info("Problem in spark env setup");
+        }
+    }
+
+    @AfterAll
+    public static void teardown() {
+        if (ctx != null) {
+            ctx.stop();
+            ctx = null;
+        }
+        if (spark != null) {
+            spark.stop();
+            spark = null;
+        }
+    }
+}
From 42e607967ef3092a6b8f2bbbb532c980d7978560 Mon Sep 17
00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 12:40:22 +0530 Subject: [PATCH 171/219] restructured packaging --- .../java/zingg/{ => common/core}/block/TestBlockBase.java | 4 +--- .../test/java/zingg/{ => common/core}/block/TestTree.java | 6 +----- .../java/zingg/{ => common/core}/block/TestSparkBlock.java | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) rename common/core/src/test/java/zingg/{ => common/core}/block/TestBlockBase.java (97%) rename common/core/src/test/java/zingg/{ => common/core}/block/TestTree.java (79%) rename spark/core/src/test/java/zingg/{ => common/core}/block/TestSparkBlock.java (98%) diff --git a/common/core/src/test/java/zingg/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java similarity index 97% rename from common/core/src/test/java/zingg/block/TestBlockBase.java rename to common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 9f1593790..18288f2c3 100644 --- a/common/core/src/test/java/zingg/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -1,4 +1,4 @@ -package zingg.block; +package zingg.common.core.block; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -14,8 +14,6 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.DFObjectUtil; -import zingg.common.core.block.Canopy; -import zingg.common.core.block.Tree; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.HashUtil; import zingg.model.Event; diff --git a/common/core/src/test/java/zingg/block/TestTree.java b/common/core/src/test/java/zingg/common/core/block/TestTree.java similarity index 79% rename from common/core/src/test/java/zingg/block/TestTree.java rename to common/core/src/test/java/zingg/common/core/block/TestTree.java index 81d5044b6..93898c105 100644 --- a/common/core/src/test/java/zingg/block/TestTree.java +++ b/common/core/src/test/java/zingg/common/core/block/TestTree.java @@ -1,11 +1,7 @@ -package zingg.block; +package zingg.common.core.block; import org.junit.jupiter.api.*; -import zingg.common.core.block.Tree; - -import static org.junit.jupiter.api.Assertions.*; - public class TestTree { @Test diff --git a/spark/core/src/test/java/zingg/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java similarity index 98% rename from spark/core/src/test/java/zingg/block/TestSparkBlock.java rename to spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 108973ba3..26f5a1652 100644 --- a/spark/core/src/test/java/zingg/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -1,4 +1,4 @@ -package zingg.block; +package zingg.common.core.block; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; From 679f2d0aa04950de113c3fd86e91271e63ef1cf1 Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 13:03:02 +0530 Subject: [PATCH 172/219] created model classes for testing stop words --- .../common/core/block/TestBlockBase.java | 6 +- .../{ => common/core}/data/Constant.java | 64 ++++++++++++++++++- .../zingg/{ => common/core}/model/Event.java | 2 +- .../{ => common/core}/model/EventCluster.java | 2 +- .../zingg/common/core/model/SchemaActual.java | 26 ++++++++ .../common/core/model/SchemaOriginal.java | 17 +++++ .../core/preprocess/TestSparkStopWords.java | 4 ++ 7 files changed, 113 insertions(+), 8 deletions(-) rename 
common/core/src/test/java/zingg/{ => common/core}/data/Constant.java (76%) rename common/core/src/test/java/zingg/{ => common/core}/model/Event.java (91%) rename common/core/src/test/java/zingg/{ => common/core}/model/EventCluster.java (95%) create mode 100644 common/core/src/test/java/zingg/common/core/model/SchemaActual.java create mode 100644 common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java create mode 100644 spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 18288f2c3..a8f746c47 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -16,9 +16,9 @@ import zingg.common.client.util.DFObjectUtil; import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.HashUtil; -import zingg.model.Event; -import zingg.model.EventCluster; -import zingg.data.Constant; +import zingg.common.core.model.Event; +import zingg.common.core.model.EventCluster; +import zingg.common.core.data.Constant; public abstract class TestBlockBase { diff --git a/common/core/src/test/java/zingg/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/Constant.java similarity index 76% rename from common/core/src/test/java/zingg/data/Constant.java rename to common/core/src/test/java/zingg/common/core/data/Constant.java index 8d8af0e40..eee323c70 100644 --- a/common/core/src/test/java/zingg/data/Constant.java +++ b/common/core/src/test/java/zingg/common/core/data/Constant.java @@ -1,7 +1,9 @@ -package zingg.data; +package zingg.common.core.data; -import zingg.model.Event; -import zingg.model.EventCluster; +import zingg.common.core.model.Event; +import zingg.common.core.model.EventCluster; +import zingg.common.core.model.SchemaActual; +import zingg.common.core.model.SchemaOriginal; import java.util.ArrayList; import java.util.List; @@ -132,4 +134,60 @@ public static List createSampleClusterEventData() { return sample; } + + public static List getData1Original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test")); + + return sample; + } + + public static List getData1Actual() { + + List sample = new ArrayList<>(); + sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", + "The zingg spark application", "two", "Yes. good application", "test")); + sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1", + "It very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1", + "It written java scala", "four", "", "test")); + sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", + "thank", "test")); + + return sample; + } + + public static List getData2original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. 
a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test")); + + return sample; + } + + public static List getData2Actual() { + + List sample = new ArrayList<>(); + sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", + "The zingg spark application", "two", "Yes. good application", "test")); + sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1", + "It very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1", + "It written java scala", "four", "", "test")); + sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", + "thank", "test")); + + return sample; + } } diff --git a/common/core/src/test/java/zingg/model/Event.java b/common/core/src/test/java/zingg/common/core/model/Event.java similarity index 91% rename from common/core/src/test/java/zingg/model/Event.java rename to common/core/src/test/java/zingg/common/core/model/Event.java index c7feb391d..0bad3e883 100644 --- a/common/core/src/test/java/zingg/model/Event.java +++ b/common/core/src/test/java/zingg/common/core/model/Event.java @@ -1,4 +1,4 @@ -package zingg.model; +package zingg.common.core.model; public class Event { public final Integer id; diff --git a/common/core/src/test/java/zingg/model/EventCluster.java b/common/core/src/test/java/zingg/common/core/model/EventCluster.java similarity index 95% rename from common/core/src/test/java/zingg/model/EventCluster.java rename to common/core/src/test/java/zingg/common/core/model/EventCluster.java index 7cca1a082..f4697cf28 100644 --- a/common/core/src/test/java/zingg/model/EventCluster.java +++ b/common/core/src/test/java/zingg/common/core/model/EventCluster.java @@ -1,4 +1,4 @@ -package zingg.model; +package zingg.common.core.model; public class EventCluster { public final Integer id; diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java b/common/core/src/test/java/zingg/common/core/model/SchemaActual.java new file mode 100644 index 000000000..762420435 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/SchemaActual.java @@ -0,0 +1,26 @@ +package zingg.common.core.model; + +public class SchemaActual { + public final String z_cluster; + public final String z_zid; + public final String z_prediction; + public final String z_score; + public final String z_isMatch; + public final String field1; + public final String field2; + public final String field3; + public final String z_zsource; + + public SchemaActual(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch, + String field1, String field2, String field3, String z_zsource) { + this.z_cluster = z_cluster; + this.z_zid = z_zid; + this.z_prediction = z_prediction; + this.z_score = z_score; + this.z_isMatch = z_isMatch; + this.field1 = field1; + this.field2 = field2; + this.field3 = field3; + this.z_zsource = z_zsource; + } +} diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java b/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java new file mode 100644 index 000000000..25d55eca2 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java @@ -0,0 
+1,17 @@ +package zingg.common.core.model; + +public class SchemaOriginal { + public final String z_zid; + public final String field1; + public final String field2; + public final String field3; + public final String z_zsource; + + public SchemaOriginal(String z_zid, String field1, String field2, String field3, String z_zsource) { + this.z_zid = z_zid; + this.field1 = field1; + this.field2 = field2; + this.field3 = field3; + this.z_zsource = z_zsource; + } +} diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java new file mode 100644 index 000000000..d7eaf27a4 --- /dev/null +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java @@ -0,0 +1,4 @@ +package zingg.common.core.preprocess; + +public class TestSparkStopWords { +} From 59d2c22a86c6ed9e870ad46449ceed9e06c6cc49 Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 14:15:11 +0530 Subject: [PATCH 173/219] initial changes --- .../java/zingg/common/core/data/Constant.java | 79 +++++++- .../java/zingg/common/core/model/Schema.java | 9 + .../common/core/preprocess/TestStopWords.java | 173 ++++++++++-------- 3 files changed, 184 insertions(+), 77 deletions(-) create mode 100644 common/core/src/test/java/zingg/common/core/model/Schema.java diff --git a/common/core/src/test/java/zingg/common/core/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/Constant.java index eee323c70..0418f9810 100644 --- a/common/core/src/test/java/zingg/common/core/data/Constant.java +++ b/common/core/src/test/java/zingg/common/core/data/Constant.java @@ -2,6 +2,7 @@ import zingg.common.core.model.Event; import zingg.common.core.model.EventCluster; +import zingg.common.core.model.Schema; import zingg.common.core.model.SchemaActual; import zingg.common.core.model.SchemaOriginal; @@ -135,7 +136,77 @@ public static List createSampleClusterEventData() { return sample; } - public static List getData1Original() { + public static List getData1Original() { + + List sample = new ArrayList<>(); + sample.add(new Schema("The zingg is a Spark application")); + sample.add(new Schema("It is very popular in data Science")); + sample.add(new Schema("It is written in Java and Scala")); + sample.add(new Schema("Best of luck to zingg")); + + return sample; + } + + public static List getData1Expected() { + + List sample = new ArrayList<>(); + sample.add(new Schema("zingg spark application")); + sample.add(new Schema("very popular in data science")); + sample.add(new Schema("written in java and scala")); + sample.add(new Schema("best luck to zingg")); + + return sample; + } + + public static List getData2Original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + + return sample; + } + + public static List getData2Expected() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. 
a good application", "test")); + sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + + return sample; + } + + public static List getData3Original() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + "test")); + sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + + return sample; + } + + public static List getData3Expected() { + + List sample = new ArrayList<>(); + sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test")); + sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); + sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + + return sample; + } + + public static List getData4original() { List sample = new ArrayList<>(); sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", @@ -148,7 +219,7 @@ public static List getData1Original() { return sample; } - public static List getData1Actual() { + public static List getData4Expected() { List sample = new ArrayList<>(); sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", @@ -163,7 +234,7 @@ public static List getData1Actual() { return sample; } - public static List getData2original() { + public static List getData5Original() { List sample = new ArrayList<>(); sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", @@ -176,7 +247,7 @@ public static List getData2original() { return sample; } - public static List getData2Actual() { + public static List getData5Actual() { List sample = new ArrayList<>(); sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", diff --git a/common/core/src/test/java/zingg/common/core/model/Schema.java b/common/core/src/test/java/zingg/common/core/model/Schema.java new file mode 100644 index 000000000..c608bd370 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/Schema.java @@ -0,0 +1,9 @@ +package zingg.common.core.model; + +public class Schema { + public final String statement; + + public Schema(String statement) { + this.statement = statement; + } +} diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java index c9ace5f3f..d26c3db4a 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java @@ -19,44 +19,62 @@ import org.junit.jupiter.api.Test; import zingg.common.client.Arguments; +import zingg.common.client.ArgumentsUtil; import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; import zingg.common.client.MatchType; +import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; +import 
zingg.common.client.util.DFObjectUtil; +import zingg.common.core.data.Constant; +import zingg.common.core.model.Event; +import zingg.common.core.model.Schema; +import zingg.common.core.model.SchemaOriginal; import zingg.spark.client.SparkFrame; import zingg.spark.core.executor.ZinggSparkTester; import zingg.spark.core.preprocess.SparkStopWordsRemover; -public class TestStopWords extends ZinggSparkTester{ +public class TestStopWords{ public static final Log LOG = LogFactory.getLog(TestStopWords.class); + public final DFObjectUtil dfObjectUtil; + public final StopWordsRemover stopWordsRemover; + public IArguments args = new Arguments(); + + public TestStopWords(DFObjectUtil dfObjectUtil, StopWordsRemover stopWordsRemover) { + this.dfObjectUtil = dfObjectUtil; + this.stopWordsRemover = stopWordsRemover; + } @DisplayName ("Test Stop Words removal from Single column dataset") @Test - public void testStopWordsSingleColumn() throws ZinggClientException { + public void testStopWordsSingleColumn() throws ZinggClientException, Exception { - StructType schema = new StructType(new StructField[] { - new StructField("statement", DataTypes.StringType, false, Metadata.empty()) - }); - - Dataset datasetOriginal = spark.createDataFrame( - Arrays.asList( - RowFactory.create("The zingg is a Spark application"), - RowFactory.create("It is very popular in data Science"), - RowFactory.create("It is written in Java and Scala"), - RowFactory.create("Best of luck to zingg")), - schema); +// StructType schema = new StructType(new StructField[] { +// new StructField("statement", DataTypes.StringType, false, Metadata.empty()) +// }); +// +// Dataset datasetOriginal = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("The zingg is a Spark application"), +// RowFactory.create("It is very popular in data Science"), +// RowFactory.create("It is written in Java and Scala"), +// RowFactory.create("Best of luck to zingg")), +// schema); String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); - Dataset datasetExpected = spark.createDataFrame( - Arrays.asList( - RowFactory.create("zingg spark application"), - RowFactory.create("very popular in data science"), - RowFactory.create("written in java and scala"), - RowFactory.create("best luck to zingg")), - schema); +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("zingg spark application"), +// RowFactory.create("very popular in data science"), +// RowFactory.create("written in java and scala"), +// RowFactory.create("best luck to zingg")), +// schema); + + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class); List fdList = new ArrayList(4); @@ -72,63 +90,68 @@ public void testStopWordsSingleColumn() throws ZinggClientException { IArguments stmtArgs = new Arguments(); stmtArgs.setFieldDefinition(fdList); - StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs); +// StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs); - stopWordsObj.preprocessForStopWords(new SparkFrame(datasetOriginal)); + stopWordsRemover.preprocessForStopWords(zFrameOriginal); System.out.println("datasetOriginal.show() : "); - datasetOriginal.show(); - SparkFrame datasetWithoutStopWords = (SparkFrame)stopWordsObj.removeStopWordsFromDF(new SparkFrame(datasetOriginal),"statement",stopWords); + zFrameOriginal.show(); + ZFrame 
zFrameWithoutStopWords = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords); System.out.println("datasetWithoutStopWords.show() : "); - datasetWithoutStopWords.show(); + zFrameWithoutStopWords.show(); - assertTrue(datasetExpected.except(datasetWithoutStopWords.df()).isEmpty()); - assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty()); +// assertTrue(zFrameExpected.except(datasetWithoutStopWords.df()).isEmpty()); +// assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty()); } @Test - public void testRemoveStopWordsFromDataset() throws ZinggClientException { - StructType schemaOriginal = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); + public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { +// StructType schemaOriginal = new StructType(new StructField[] { +// new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), +// new StructField("field1", DataTypes.StringType, false, Metadata.empty()), +// new StructField("field2", DataTypes.StringType, false, Metadata.empty()), +// new StructField("field3", DataTypes.StringType, false, Metadata.empty()), +// new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) +// }); - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. a good application", "test"), - RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), - schemaOriginal); +// Dataset original = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "The zingg is a spark application", "two", +// "Yes. a good application", "test"), +// RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", +// "test"), +// RowFactory.create("30", "It is written in java and scala", "four", "", "test"), +// RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), +// schemaOriginal); - Dataset datasetExpected = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"), - RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), - RowFactory.create("30", "written java scala", "four", "", "test"), - RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), - schemaOriginal); +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test"), +// RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), +// RowFactory.create("30", "written java scala", "four", "", "test"), +// RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), +// schemaOriginal); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class); + String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWords.csv").getFile(); FieldDefinition fd = new FieldDefinition(); fd.setStopWords(stopWordsFileName); fd.setFieldName("field1"); List fieldDefinitionList = Arrays.asList(fd); - args.setFieldDefinition(fieldDefinitionList); +// args.setFieldDefinition(fieldDefinitionList); - SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); +// SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); + + ZFrame zFrameNew = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - Dataset newDataSet = ((SparkFrame)(stopWordsObj.preprocessForStopWords(new SparkFrame(original)))).df(); - assertTrue(datasetExpected.except(newDataSet).isEmpty()); - assertTrue(newDataSet.except(datasetExpected).isEmpty()); +// Dataset newDataSet = ((SparkFrame)(stopWordsRemover.preprocessForStopWords(new SparkFrame(original)))).df(); +// assertTrue(datasetExpected.except(newDataSet).isEmpty()); +// assertTrue(newDataSet.except(datasetExpected).isEmpty()); } @Test - public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException { + public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { StructType schemaOriginal = new StructType(new StructField[] { new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), new StructField("field1", DataTypes.StringType, false, Metadata.empty()), @@ -137,23 +160,27 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) }); - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. a good application", "test"), - RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), - schemaOriginal); +// Dataset original = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "The zingg is a spark application", "two", +// "Yes. a good application", "test"), +// RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", +// "test"), +// RowFactory.create("30", "It is written in java and scala", "four", "", "test"), +// RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), +// schemaOriginal); +// +// Dataset datasetExpected = spark.createDataFrame( +// Arrays.asList( +// RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test")),
+//                RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
+//                RowFactory.create("30", "written java scala", "four", "", "test"),
+//                RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
+//                schemaOriginal);
+
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class);
         String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWordsWithoutHeader.csv").getFile();
         FieldDefinition fd = new FieldDefinition();
         fd.setStopWords(stopWordsFileName);
From 47123e208911bb119b6c83223d53f09f98ef2834 Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:15:18 +0530
Subject: [PATCH 174/219] initial changes stop words

---
 .../core/preprocess/TestSparkStopWords.java | 67 ++++-
 .../common/core/preprocess/TestStopWords.java | 264 +++----------------
 .../core/util/SampleStopWordRemover.java | 63 +++++
 3 files changed, 168 insertions(+), 226 deletions(-)
 create mode 100644 spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java

diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
index d7eaf27a4..5d925cd33 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java
@@ -1,4 +1,69 @@
 package zingg.common.core.preprocess;
 
-public class TestSparkStopWords {
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataType;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import zingg.common.client.IArguments;
+import zingg.common.client.ZinggClientException;
+import zingg.common.client.util.WithSession;
+import zingg.common.core.util.SampleStopWordRemover;
+import zingg.spark.client.util.SparkDFObjectUtil;
+import zingg.spark.client.util.WithSparkSession;
+import zingg.spark.core.context.ZinggSparkContext;
+
+public class TestSparkStopWords extends TestStopWords<SparkSession, Dataset<Row>, Row, Column, DataType> {
+
+    public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class);
+    public static IArguments args;
+    public static JavaSparkContext ctx;
+    public static SparkSession spark;
+    public static ZinggSparkContext zsCTX;
+    public static WithSession<SparkSession> withSession;
+
+    @BeforeAll
+    public static void setup() {
+        setUpSpark();
+    }
+
+    public TestSparkStopWords() throws ZinggClientException {
+        super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, args), zsCTX);
+    }
+
+    protected static void setUpSpark() {
+        try {
+            spark = SparkSession
+                    .builder()
+                    .master("local[*]")
+                    .appName("Zingg" + "Junit")
+                    .getOrCreate();
+            ctx = new JavaSparkContext(spark.sparkContext());
+            withSession = new WithSparkSession();
+            withSession.setSession(spark);
+            zsCTX = new ZinggSparkContext();
+            zsCTX.init(spark);
+        } catch (Throwable e) {
+            if (LOG.isDebugEnabled())
+                e.printStackTrace();
+            LOG.info("Problem in spark env setup");
+        }
+    }
+
+    @AfterAll
+    public static void teardown() {
+        if (ctx != null) {
+            ctx.stop();
+            ctx = null;
+        }
+        if (spark != null) {
+            spark.stop();
+            spark = null;
+        }
+    }
 }
diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
index d26c3db4a..ac8255cdc 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java
@@ -27,284 +27,98 @@ import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DFObjectUtil;
+import zingg.common.core.context.Context;
 import zingg.common.core.data.Constant;
 import zingg.common.core.model.Event;
 import zingg.common.core.model.Schema;
+import zingg.common.core.model.SchemaActual;
 import zingg.common.core.model.SchemaOriginal;
 import zingg.spark.client.SparkFrame;
+import zingg.spark.core.context.ZinggSparkContext;
 import zingg.spark.core.executor.ZinggSparkTester;
 import zingg.spark.core.preprocess.SparkStopWordsRemover;
 
-public class TestStopWords<S, D, R, C, T>{
+public abstract class TestStopWords<S, D, R, C, T> {
 
     public static final Log LOG = LogFactory.getLog(TestStopWords.class);
-    public final DFObjectUtil<S, D, R, C> dfObjectUtil;
-    public final StopWordsRemover<S, D, R, C, T> stopWordsRemover;
-    public IArguments args = new Arguments();
+    private final DFObjectUtil<S, D, R, C> dfObjectUtil;
+    private final List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers;
+    private final Context<S, D, R, C, T> context;
 
-    public TestStopWords(DFObjectUtil<S, D, R, C> dfObjectUtil, StopWordsRemover<S, D, R, C, T> stopWordsRemover) {
+    public TestStopWords(DFObjectUtil<S, D, R, C> dfObjectUtil, List<StopWordsRemover<S, D, R, C, T>> stopWordsRemovers,
+                         Context<S, D, R, C, T> context) {
         this.dfObjectUtil = dfObjectUtil;
-        this.stopWordsRemover = stopWordsRemover;
+        this.stopWordsRemovers = stopWordsRemovers;
+        this.context = context;
     }
 
     @DisplayName ("Test Stop Words removal from Single column dataset")
     @Test
     public void testStopWordsSingleColumn() throws ZinggClientException, Exception {
-
-//        StructType schema = new StructType(new StructField[] {
-//                new StructField("statement", DataTypes.StringType, false, Metadata.empty())
-//        });
-//
-//        Dataset<Row> datasetOriginal = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("The zingg is a Spark application"),
-//                        RowFactory.create("It is very popular in data Science"),
-//                        RowFactory.create("It is written in Java and Scala"),
-//                        RowFactory.create("Best of luck to zingg")),
-//                schema);
         String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase();
-//        Dataset<Row> datasetExpected = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("zingg spark application"),
-//                        RowFactory.create("very popular in data science"),
-//                        RowFactory.create("written in java and scala"),
-//                        RowFactory.create("best luck to zingg")),
-//                schema);
+        ZFrame<D, R, C> zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class);
+        ZFrame<D, R, C> zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class);
-        List<FieldDefinition> fdList = new ArrayList<FieldDefinition>(4);
-
-        ArrayList<MatchType> matchTypelistFuzzy = new ArrayList<MatchType>();
-        matchTypelistFuzzy.add(MatchType.FUZZY);
-
-        FieldDefinition eventFD = new FieldDefinition();
-        eventFD.setDataType("string");
-        eventFD.setFieldName("statement");
-        eventFD.setMatchType(matchTypelistFuzzy);
-        fdList.add(eventFD);
-
-        IArguments stmtArgs = new Arguments();
-        stmtArgs.setFieldDefinition(fdList);
-
-//        StopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,stmtArgs);
+        StopWordsRemover<S, D, R, C, T> stopWordsRemover = stopWordsRemovers.get(0);
 
         stopWordsRemover.preprocessForStopWords(zFrameOriginal);
-        System.out.println("datasetOriginal.show() : ");
-        zFrameOriginal.show();
-        ZFrame<D, R, C> zFrameWithoutStopWords = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords);
-        System.out.println("datasetWithoutStopWords.show() : ");
-        zFrameWithoutStopWords.show();
-
-//        assertTrue(zFrameExpected.except(datasetWithoutStopWords.df()).isEmpty());
-//        assertTrue(datasetWithoutStopWords.df().except(datasetExpected).isEmpty());
+        ZFrame<D, R, C> newZFrame = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords);
+
+        assertTrue(zFrameExpected.except(newZFrame).isEmpty());
+        assertTrue(newZFrame.except(zFrameExpected).isEmpty());
     }
 
     @Test
     public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception {
-//        StructType schemaOriginal = new StructType(new StructField[] {
-//                new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
-//                new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
-//        });
-
-//        Dataset<Row> original = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "The zingg is a spark application", "two",
-//                                "Yes. a good application", "test"),
-//                        RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed",
-//                                "test"),
-//                        RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
-//                        RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")),
-//                schemaOriginal);
-//        Dataset<Row> datasetExpected = spark.createDataFrame(
-//                Arrays.asList(
-//                        RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test"), -// RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), -// RowFactory.create("30", "written java scala", "four", "", "test"), -// RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), -// schemaOriginal); ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class); ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class); - - String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWords.csv").getFile(); - FieldDefinition fd = new FieldDefinition(); - fd.setStopWords(stopWordsFileName); - fd.setFieldName("field1"); - - List fieldDefinitionList = Arrays.asList(fd); -// args.setFieldDefinition(fieldDefinitionList); -// SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); - - ZFrame zFrameNew = stopWordsRemover.preprocessForStopWords(zFrameOriginal); + StopWordsRemover stopWordsRemover = stopWordsRemovers.get(1); + ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); -// Dataset newDataSet = ((SparkFrame)(stopWordsRemover.preprocessForStopWords(new SparkFrame(original)))).df(); -// assertTrue(datasetExpected.except(newDataSet).isEmpty()); -// assertTrue(newDataSet.except(datasetExpected).isEmpty()); + assertTrue(zFrameExpected.except(newZFrame).isEmpty()); + assertTrue(newZFrame.except(zFrameExpected).isEmpty()); } @Test public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { - StructType schemaOriginal = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); - -// Dataset original = spark.createDataFrame( -// Arrays.asList( -// RowFactory.create("10", "The zingg is a spark application", "two", -// "Yes. a good application", "test"), -// RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed", -// "test"), -// RowFactory.create("30", "It is written in java and scala", "four", "", "test"), -// RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")), -// schemaOriginal); -// -// Dataset datasetExpected = spark.createDataFrame( -// Arrays.asList( -// RowFactory.create("10", "zingg spark application", "two", "Yes. 
a good application", "test"), -// RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"), -// RowFactory.create("30", "written java scala", "four", "", "test"), -// RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")), -// schemaOriginal); ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class); ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class); - String stopWordsFileName = getClass().getResource("../../../../preProcess/stopWordsWithoutHeader.csv").getFile(); - FieldDefinition fd = new FieldDefinition(); - fd.setStopWords(stopWordsFileName); - fd.setFieldName("field1"); + StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2); + ZFrame newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - List fieldDefinitionList = Arrays.asList(fd); - args.setFieldDefinition(fieldDefinitionList); - - SparkStopWordsRemover stopWordsObj = new SparkStopWordsRemover(zsCTX,args); - - System.out.println("testStopWordColumnMissingFromStopWordFile : orginal "); - original.show(200); - Dataset newDataSet = ((SparkFrame)(stopWordsObj.preprocessForStopWords(new SparkFrame(original)))).df(); - System.out.println("testStopWordColumnMissingFromStopWordFile : newDataSet "); - newDataSet.show(200); - System.out.println("testStopWordColumnMissingFromStopWordFile : datasetExpected "); - datasetExpected.show(200); - assertTrue(datasetExpected.except(newDataSet).isEmpty()); - assertTrue(newDataSet.except(datasetExpected).isEmpty()); + assertTrue(zFrameExpected.except(newDataSet).isEmpty()); + assertTrue(newDataSet.except(zFrameExpected).isEmpty()); } @Test - public void testForOriginalDataAfterPostprocess() { - StructType schemaActual = new StructType(new StructField[] { - new StructField(ColName.CLUSTER_COLUMN, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.PREDICTION_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SCORE_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.MATCH_FLAG_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); - - StructType schemaOriginal = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); + public void testForOriginalDataAfterPostProcess() throws Exception { - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. 
a good application", "test"), - RowFactory.create("20", "It is very popular in data science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg", "Five", "thank you", "test")), - schemaOriginal); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData4original(), SchemaOriginal.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData4Expected(), SchemaActual.class); - Dataset actual = spark.createDataFrame( - Arrays.asList( - RowFactory.create("1648811730857:10", "10", "1.0", "0.555555", "-1", - "The zingg spark application", "two", "Yes. good application", "test"), - RowFactory.create("1648811730857:20", "20", "1.0", "1.0", "-1", - "It very popular data science", "Three", "true indeed", "test"), - RowFactory.create("1648811730857:30", "30", "1.0", "0.999995", "-1", - "It written java scala", "four", "", "test"), - RowFactory.create("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", - "thank", "test")), - schemaActual); + ZFrame newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal); - Dataset newDataset = ((SparkFrame)(zsCTX.getDSUtil().postprocess(new SparkFrame(actual), new SparkFrame(original)))).df(); - assertTrue(newDataset.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(original).isEmpty()); - assertTrue(original.except(newDataset.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty()); + assertTrue(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(zFrameOriginal).isEmpty()); + assertTrue(zFrameOriginal.except(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty()); } @Test - public void testOriginalDataAfterPostprocessLinked() { - StructType schemaActual = new StructType(new StructField[] { - new StructField(ColName.CLUSTER_COLUMN, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.PREDICTION_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SCORE_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.MATCH_FLAG_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); + public void testOriginalDataAfterPostProcessLinked() throws Exception { - StructType schemaOriginal = new StructType(new StructField[] { - new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()), - new StructField("field1", DataTypes.StringType, false, Metadata.empty()), - new StructField("field2", DataTypes.StringType, false, Metadata.empty()), - new StructField("field3", DataTypes.StringType, false, Metadata.empty()), - new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty()) - }); - - Dataset original = spark.createDataFrame( - Arrays.asList( - RowFactory.create("10", "The zingg is a spark application", "two", - "Yes. 
a good application", "test"), - RowFactory.create("20", "It is very popular in data science", "Three", "true indeed", - "test"), - RowFactory.create("30", "It is written in java and scala", "four", "", "test"), - RowFactory.create("40", "Best of luck to zingg", "Five", "thank you", "test")), - schemaOriginal); - - Dataset actual = spark.createDataFrame( - Arrays.asList( - RowFactory.create("1648811730857:10", "10", "1.0", "0.555555", "-1", - "The zingg spark application", "two", "Yes. good application", "test"), - RowFactory.create("1648811730857:20", "20", "1.0", "1.0", "-1", - "It very popular data science", "Three", "true indeed", "test"), - RowFactory.create("1648811730857:30", "30", "1.0", "0.999995", "-1", - "It written java scala", "four", "", "test"), - RowFactory.create("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", - "thank", "test")), - schemaActual); - - System.out.println("testOriginalDataAfterPostprocessLinked original :"); - original.show(200); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData5Original(), SchemaOriginal.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData5Actual(), SchemaActual.class); - Dataset newDataset = ((SparkFrame)(zsCTX.getDSUtil().postprocessLinked(new SparkFrame(actual), new SparkFrame(original)))).df(); + ZFrame newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal); - System.out.println("testOriginalDataAfterPostprocessLinked newDataset :"); - newDataset.show(200); - - assertTrue(newDataset.select("field1", "field2", "field3").except(original.select("field1", "field2", "field3")).isEmpty()); - assertTrue(original.select("field1", "field2", "field3").except(newDataset.select("field1", "field2", "field3")).isEmpty()); + assertTrue(newZFrame.select("field1", "field2", "field3").except(zFrameOriginal.select("field1", "field2", "field3")).isEmpty()); + assertTrue(zFrameOriginal.select("field1", "field2", "field3").except(newZFrame.select("field1", "field2", "field3")).isEmpty()); } + } \ No newline at end of file diff --git a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java b/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java new file mode 100644 index 000000000..fe839e593 --- /dev/null +++ b/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java @@ -0,0 +1,63 @@ +package zingg.common.core.util; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; +import zingg.common.client.Arguments; +import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; +import zingg.common.client.MatchType; +import zingg.common.client.ZinggClientException; +import zingg.common.core.preprocess.StopWordsRemover; +import zingg.spark.core.context.ZinggSparkContext; +import zingg.spark.core.preprocess.SparkStopWordsRemover; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class SampleStopWordRemover { + + public static List, Row, Column, DataType>> getStopWordRemovers(ZinggSparkContext zsCTX, + IArguments args) throws ZinggClientException { + + List, Row, Column, DataType>> sparkStopWordsRemovers = new ArrayList<>(); + + //add first stopWordRemover + List fdList = new ArrayList(4); + ArrayList matchTypelistFuzzy = new ArrayList(); + matchTypelistFuzzy.add(MatchType.FUZZY); + FieldDefinition 
eventFD = new FieldDefinition(); + eventFD.setDataType("string"); + eventFD.setFieldName("statement"); + eventFD.setMatchType(matchTypelistFuzzy); + fdList.add(eventFD); + IArguments stmtArgs = new Arguments(); + stmtArgs.setFieldDefinition(fdList); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX,stmtArgs)); + + //add second stopWordRemover + String stopWordsFileName1 = Objects.requireNonNull( + SampleStopWordRemover.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); + FieldDefinition fieldDefinition1 = new FieldDefinition(); + fieldDefinition1.setStopWords(stopWordsFileName1); + fieldDefinition1.setFieldName("field1"); + List fieldDefinitionList1 = List.of(fieldDefinition1); + args.setFieldDefinition(fieldDefinitionList1); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args)); + + //add third stopWordRemover + String stopWordsFileName2 = Objects.requireNonNull( + SampleStopWordRemover.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); + FieldDefinition fieldDefinition2 = new FieldDefinition(); + fieldDefinition2.setStopWords(stopWordsFileName2); + fieldDefinition2.setFieldName("field1"); + List fieldDefinitionList2 = List.of(fieldDefinition2); + args.setFieldDefinition(fieldDefinitionList2); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args)); + + return sparkStopWordsRemovers; + } +} From d2d3eaf8a7bbd2c970b812d5482991bdaf314101 Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 18:17:55 +0530 Subject: [PATCH 175/219] renamed TestStopWords to TestStopWordsBase --- ...Words.java => TestSparkStopWordsBase.java} | 11 ++++---- ...tStopWords.java => TestStopWordsBase.java} | 27 +++---------------- 2 files changed, 9 insertions(+), 29 deletions(-) rename spark/core/src/test/java/zingg/common/core/preprocess/{TestSparkStopWords.java => TestSparkStopWordsBase.java} (86%) rename spark/core/src/test/java/zingg/common/core/preprocess/{TestStopWords.java => TestStopWordsBase.java} (80%) diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java similarity index 86% rename from spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java rename to spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java index 5d925cd33..a2efce588 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java @@ -10,7 +10,7 @@ import org.apache.spark.sql.types.DataType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import zingg.common.client.IArguments; +import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.util.WithSession; import zingg.common.core.util.SampleStopWordRemover; @@ -18,10 +18,9 @@ import zingg.spark.client.util.WithSparkSession; import zingg.spark.core.context.ZinggSparkContext; -public class TestSparkStopWords extends TestStopWords, Row, Column, DataType> { +public class TestSparkStopWordsBase extends TestStopWordsBase, Row, Column, DataType> { - public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class); - public static IArguments args; + public static final Log LOG = LogFactory.getLog(TestSparkStopWordsBase.class); public static JavaSparkContext ctx; public static SparkSession spark; public static ZinggSparkContext 
zsCTX; @@ -32,8 +31,8 @@ public static void setup() { setUpSpark(); } - public TestSparkStopWords() throws ZinggClientException { - super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, args), zsCTX); + public TestSparkStopWordsBase() throws ZinggClientException { + super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, new Arguments()), zsCTX); } protected static void setUpSpark() { diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java similarity index 80% rename from spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java rename to spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java index ac8255cdc..0a56b778d 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -2,51 +2,32 @@ import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; -import zingg.common.client.Arguments; -import zingg.common.client.ArgumentsUtil; -import zingg.common.client.FieldDefinition; -import zingg.common.client.IArguments; -import zingg.common.client.MatchType; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.core.context.Context; import zingg.common.core.data.Constant; -import zingg.common.core.model.Event; import zingg.common.core.model.Schema; import zingg.common.core.model.SchemaActual; import zingg.common.core.model.SchemaOriginal; -import zingg.spark.client.SparkFrame; -import zingg.spark.core.context.ZinggSparkContext; -import zingg.spark.core.executor.ZinggSparkTester; -import zingg.spark.core.preprocess.SparkStopWordsRemover; -public abstract class TestStopWords { +public abstract class TestStopWordsBase { - public static final Log LOG = LogFactory.getLog(TestStopWords.class); + public static final Log LOG = LogFactory.getLog(TestStopWordsBase.class); private final DFObjectUtil dfObjectUtil; private final List> stopWordsRemovers; private final Context context; - public TestStopWords(DFObjectUtil dfObjectUtil, List> stopWordsRemovers, - Context context) { + public TestStopWordsBase(DFObjectUtil dfObjectUtil, List> stopWordsRemovers, + Context context) { this.dfObjectUtil = dfObjectUtil; this.stopWordsRemovers = stopWordsRemovers; this.context = context; From 0b3ca28664d63b676886d607bbb650d90f204a8d Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 9 Jul 2024 18:46:46 +0530 Subject: [PATCH 176/219] initial commit --- .../common/core/block/TestBlockBase.java | 6 +- .../data/{Constant.java => TestData.java} | 128 +++++++++--------- ...maActual.java => PostStopWordProcess.java} | 6 +- ...riginal.java => PriorStopWordProcess.java} | 4 +- .../model/{Schema.java => Statement.java} | 4 +- 
.../core/preprocess/TestStopWordsBase.java | 37 ++--- 6 files changed, 94 insertions(+), 91 deletions(-) rename common/core/src/test/java/zingg/common/core/data/{Constant.java => TestData.java} (67%) rename common/core/src/test/java/zingg/common/core/model/{SchemaActual.java => PostStopWordProcess.java} (72%) rename common/core/src/test/java/zingg/common/core/model/{SchemaOriginal.java => PriorStopWordProcess.java} (70%) rename common/core/src/test/java/zingg/common/core/model/{Schema.java => Statement.java} (63%) diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index a8f746c47..3d4f440af 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -18,7 +18,7 @@ import zingg.common.core.util.HashUtil; import zingg.common.core.model.Event; import zingg.common.core.model.EventCluster; -import zingg.common.core.data.Constant; +import zingg.common.core.data.TestData; public abstract class TestBlockBase { @@ -37,8 +37,8 @@ public TestBlockBase(DFObjectUtil dfObjectUtil, HashUtil zFrameEvent = dfObjectUtil.getDFFromObjectList(Constant.createSampleEventData(), Event.class); - ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(Constant.createSampleClusterEventData(), EventCluster.class); + ZFrame zFrameEvent = dfObjectUtil.getDFFromObjectList(TestData.createSampleEventData(), Event.class); + ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(TestData.createSampleClusterEventData(), EventCluster.class); IArguments args = getArguments(); Tree> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1, diff --git a/common/core/src/test/java/zingg/common/core/data/Constant.java b/common/core/src/test/java/zingg/common/core/data/TestData.java similarity index 67% rename from common/core/src/test/java/zingg/common/core/data/Constant.java rename to common/core/src/test/java/zingg/common/core/data/TestData.java index 0418f9810..7418ac21c 100644 --- a/common/core/src/test/java/zingg/common/core/data/Constant.java +++ b/common/core/src/test/java/zingg/common/core/data/TestData.java @@ -2,14 +2,14 @@ import zingg.common.core.model.Event; import zingg.common.core.model.EventCluster; -import zingg.common.core.model.Schema; -import zingg.common.core.model.SchemaActual; -import zingg.common.core.model.SchemaOriginal; +import zingg.common.core.model.Statement; +import zingg.common.core.model.PostStopWordProcess; +import zingg.common.core.model.PriorStopWordProcess; import java.util.ArrayList; import java.util.List; -public class Constant { +public class TestData { public static List createSampleEventData() { int row_id = 1; @@ -136,127 +136,127 @@ public static List createSampleClusterEventData() { return sample; } - public static List getData1Original() { + public static List getData1Original() { - List sample = new ArrayList<>(); - sample.add(new Schema("The zingg is a Spark application")); - sample.add(new Schema("It is very popular in data Science")); - sample.add(new Schema("It is written in Java and Scala")); - sample.add(new Schema("Best of luck to zingg")); + List sample = new ArrayList<>(); + sample.add(new Statement("The zingg is a Spark application")); + sample.add(new Statement("It is very popular in data Science")); + sample.add(new Statement("It is written in Java and Scala")); + sample.add(new Statement("Best of luck to zingg")); 
return sample; } - public static List getData1Expected() { + public static List getData1Expected() { - List sample = new ArrayList<>(); - sample.add(new Schema("zingg spark application")); - sample.add(new Schema("very popular in data science")); - sample.add(new Schema("written in java and scala")); - sample.add(new Schema("best luck to zingg")); + List sample = new ArrayList<>(); + sample.add(new Statement("zingg spark application")); + sample.add(new Statement("very popular in data science")); + sample.add(new Statement("written in java and scala")); + sample.add(new Statement("best luck to zingg")); return sample; } - public static List getData2Original() { + public static List getData2Original() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); - sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); return sample; } - public static List getData2Expected() { + public static List getData2Expected() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test")); - sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. a good application", "test")); + sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "best luck to zingg ", "Five", "thank you", "test")); return sample; } - public static List getData3Original() { + public static List getData3Original() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. 
a good application", "test")); - sample.add(new SchemaOriginal("20", "It is very popular in Data Science", "Three", "true indeed", + sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); + sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")); return sample; } - public static List getData3Expected() { + public static List getData3Expected() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "zingg spark application", "two", "Yes. a good application", "test")); - sample.add(new SchemaOriginal("20", "very popular data science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "written java scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "best luck to zingg ", "Five", "thank you", "test")); + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. a good application", "test")); + sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test")); + sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "best luck to zingg ", "Five", "thank you", "test")); return sample; } - public static List getData4original() { + public static List getData4original() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); - sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed", + sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test")); + sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "Best of luck to zingg", "Five", "thank you", "test")); return sample; } - public static List getData4Expected() { + public static List getData4Expected() { - List sample = new ArrayList<>(); - sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", + List sample = new ArrayList<>(); + sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1", "The zingg spark application", "two", "Yes. 
good application", "test")); - sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1", + sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1", "It very popular data science", "Three", "true indeed", "test")); - sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1", + sample.add(new PostStopWordProcess("1648811730857:30", "30", "1.0", "0.999995", "-1", "It written java scala", "four", "", "test")); - sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", + sample.add(new PostStopWordProcess("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", "thank", "test")); return sample; } - public static List getData5Original() { + public static List getData5Original() { - List sample = new ArrayList<>(); - sample.add(new SchemaOriginal("10", "The zingg is a spark application", "two", + List sample = new ArrayList<>(); + sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); - sample.add(new SchemaOriginal("20", "It is very popular in data science", "Three", "true indeed", + sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed", "test")); - sample.add(new SchemaOriginal("30", "It is written in java and scala", "four", "", "test")); - sample.add(new SchemaOriginal("40", "Best of luck to zingg", "Five", "thank you", "test")); + sample.add(new PriorStopWordProcess("30", "It is written in java and scala", "four", "", "test")); + sample.add(new PriorStopWordProcess("40", "Best of luck to zingg", "Five", "thank you", "test")); return sample; } - public static List getData5Actual() { + public static List getData5Actual() { - List sample = new ArrayList<>(); - sample.add(new SchemaActual("1648811730857:10", "10", "1.0", "0.555555", "-1", + List sample = new ArrayList<>(); + sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1", "The zingg spark application", "two", "Yes. 
good application", "test")); - sample.add(new SchemaActual("1648811730857:20", "20", "1.0", "1.0", "-1", + sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1", "It very popular data science", "Three", "true indeed", "test")); - sample.add(new SchemaActual("1648811730857:30", "30", "1.0", "0.999995", "-1", + sample.add(new PostStopWordProcess("1648811730857:30", "30", "1.0", "0.999995", "-1", "It written java scala", "four", "", "test")); - sample.add(new SchemaActual("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", + sample.add(new PostStopWordProcess("1648811730857:40", "40", "1.0", "1.0", "-1", "Best luck zingg", "Five", "thank", "test")); return sample; diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java b/common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java similarity index 72% rename from common/core/src/test/java/zingg/common/core/model/SchemaActual.java rename to common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java index 762420435..c137fa559 100644 --- a/common/core/src/test/java/zingg/common/core/model/SchemaActual.java +++ b/common/core/src/test/java/zingg/common/core/model/PostStopWordProcess.java @@ -1,6 +1,6 @@ package zingg.common.core.model; -public class SchemaActual { +public class PostStopWordProcess { public final String z_cluster; public final String z_zid; public final String z_prediction; @@ -11,8 +11,8 @@ public class SchemaActual { public final String field3; public final String z_zsource; - public SchemaActual(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch, - String field1, String field2, String field3, String z_zsource) { + public PostStopWordProcess(String z_cluster, String z_zid, String z_prediction, String z_score, String z_isMatch, + String field1, String field2, String field3, String z_zsource) { this.z_cluster = z_cluster; this.z_zid = z_zid; this.z_prediction = z_prediction; diff --git a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java b/common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java similarity index 70% rename from common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java rename to common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java index 25d55eca2..502a8ef2b 100644 --- a/common/core/src/test/java/zingg/common/core/model/SchemaOriginal.java +++ b/common/core/src/test/java/zingg/common/core/model/PriorStopWordProcess.java @@ -1,13 +1,13 @@ package zingg.common.core.model; -public class SchemaOriginal { +public class PriorStopWordProcess { public final String z_zid; public final String field1; public final String field2; public final String field3; public final String z_zsource; - public SchemaOriginal(String z_zid, String field1, String field2, String field3, String z_zsource) { + public PriorStopWordProcess(String z_zid, String field1, String field2, String field3, String z_zsource) { this.z_zid = z_zid; this.field1 = field1; this.field2 = field2; diff --git a/common/core/src/test/java/zingg/common/core/model/Schema.java b/common/core/src/test/java/zingg/common/core/model/Statement.java similarity index 63% rename from common/core/src/test/java/zingg/common/core/model/Schema.java rename to common/core/src/test/java/zingg/common/core/model/Statement.java index c608bd370..1fabf51ef 100644 --- a/common/core/src/test/java/zingg/common/core/model/Schema.java +++ 
b/common/core/src/test/java/zingg/common/core/model/Statement.java @@ -1,9 +1,9 @@ package zingg.common.core.model; -public class Schema { +public class Statement { public final String statement; - public Schema(String statement) { + public Statement(String statement) { this.statement = statement; } } diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java index 0a56b778d..51ff098b9 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -9,15 +9,17 @@ import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; +import zingg.common.client.Arguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.core.context.Context; -import zingg.common.core.data.Constant; -import zingg.common.core.model.Schema; -import zingg.common.core.model.SchemaActual; -import zingg.common.core.model.SchemaOriginal; +import zingg.common.core.data.TestData; +import zingg.common.core.model.Statement; +import zingg.common.core.model.PostStopWordProcess; +import zingg.common.core.model.PriorStopWordProcess; +import zingg.common.core.util.StopWordRemoverUtility; public abstract class TestStopWordsBase { @@ -26,10 +28,11 @@ public abstract class TestStopWordsBase { private final List> stopWordsRemovers; private final Context context; - public TestStopWordsBase(DFObjectUtil dfObjectUtil, List> stopWordsRemovers, - Context context) { + + public TestStopWordsBase(DFObjectUtil dfObjectUtil, StopWordRemoverUtility stopWordRemoverUtility, + Context context) throws ZinggClientException { this.dfObjectUtil = dfObjectUtil; - this.stopWordsRemovers = stopWordsRemovers; + this.stopWordsRemovers = stopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); this.context = context; } @@ -39,8 +42,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception { String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData1Original(), Schema.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData1Expected(), Schema.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData1Original(), Statement.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData1Expected(), Statement.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(0); @@ -54,8 +57,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception { @Test public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData2Original(), SchemaOriginal.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData2Expected(), SchemaOriginal.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData2Original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData2Expected(), PriorStopWordProcess.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(1); ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); @@ -67,8 +70,8 @@ public void 
testRemoveStopWordsFromDataset() throws ZinggClientException, Except
 @Test
 public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception {
- ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData3Original(), SchemaOriginal.class);
- ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData3Expected(), SchemaOriginal.class);
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData3Original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class);
 StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2);
 ZFrame newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
@@ -81,8 +84,8 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept
 @Test
 public void testForOriginalDataAfterPostProcess() throws Exception {
- ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData4original(), SchemaOriginal.class);
- ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData4Expected(), SchemaActual.class);
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData4original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData4Expected(), PostStopWordProcess.class);
 ZFrame newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal);
@@ -93,8 +96,8 @@ public void testForOriginalDataAfterPostProcess() throws Exception {
 @Test
 public void testOriginalDataAfterPostProcessLinked() throws Exception {
- ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(Constant.getData5Original(), SchemaOriginal.class);
- ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(Constant.getData5Actual(), SchemaActual.class);
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData5Original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData5Actual(), PostStopWordProcess.class);
 ZFrame newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal);

From ff3320f4ed9a717ecebfa005abe09da9f9411f4f Mon Sep 17 00:00:00 2001
From: administrator
Date: Wed, 10 Jul 2024 12:30:15 +0530
Subject: [PATCH 177/219] renamed newDataSet to newZFrame

---
 .../zingg/common/core/preprocess/TestStopWordsBase.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
index 51ff098b9..b8210db03 100644
--- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
+++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
@@ -74,10 +74,10 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept
 ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class);
 StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2);
- ZFrame newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
+ ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
- assertTrue(zFrameExpected.except(newDataSet).isEmpty());
- assertTrue(newDataSet.except(zFrameExpected).isEmpty());
+ assertTrue(zFrameExpected.except(newZFrame).isEmpty());
+ assertTrue(newZFrame.except(zFrameExpected).isEmpty());
 }
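The rename above touches an assertion idiom this suite repeats everywhere: two
ZFrames are considered equal when both set differences are empty. A minimal
helper sketch of that idiom, assuming ZFrame's usual three type parameters
(assertZFrameSetEquals is illustrative, not an existing Zingg API):

    import static org.junit.jupiter.api.Assertions.assertTrue;

    import zingg.common.client.ZFrame;

    public class ZFrameAssertions {
        // except() keeps rows present in one frame but absent from the other, so
        // an empty result in both directions means the frames hold the same rows.
        public static <D, R, C> void assertZFrameSetEquals(ZFrame<D, R, C> expected, ZFrame<D, R, C> actual) {
            assertTrue(expected.except(actual).isEmpty());
            assertTrue(actual.except(expected).isEmpty());
        }
    }

From 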
2d8960728a81ad23c04674adff2cfa7ce7d6ba22 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:33:13 +0530 Subject: [PATCH 178/219] added TestStopWordsBase --- .../core/preprocess/TestStopWordsBase.java | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java new file mode 100644 index 000000000..51ff098b9 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -0,0 +1,108 @@ +package zingg.common.core.preprocess; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +import zingg.common.client.Arguments; +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; +import zingg.common.client.util.DFObjectUtil; +import zingg.common.core.context.Context; +import zingg.common.core.data.TestData; +import zingg.common.core.model.Statement; +import zingg.common.core.model.PostStopWordProcess; +import zingg.common.core.model.PriorStopWordProcess; +import zingg.common.core.util.StopWordRemoverUtility; + +public abstract class TestStopWordsBase { + + public static final Log LOG = LogFactory.getLog(TestStopWordsBase.class); + private final DFObjectUtil dfObjectUtil; + private final List> stopWordsRemovers; + private final Context context; + + + public TestStopWordsBase(DFObjectUtil dfObjectUtil, StopWordRemoverUtility stopWordRemoverUtility, + Context context) throws ZinggClientException { + this.dfObjectUtil = dfObjectUtil; + this.stopWordsRemovers = stopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); + this.context = context; + } + + @DisplayName ("Test Stop Words removal from Single column dataset") + @Test + public void testStopWordsSingleColumn() throws ZinggClientException, Exception { + + String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); + + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData1Original(), Statement.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData1Expected(), Statement.class); + + StopWordsRemover stopWordsRemover = stopWordsRemovers.get(0); + + stopWordsRemover.preprocessForStopWords(zFrameOriginal); + ZFrame newZFrame = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords); + + assertTrue(zFrameExpected.except(newZFrame).isEmpty()); + assertTrue(newZFrame.except(zFrameExpected).isEmpty()); + } + + @Test + public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { + + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData2Original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData2Expected(), PriorStopWordProcess.class); + + StopWordsRemover stopWordsRemover = stopWordsRemovers.get(1); + ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); + + assertTrue(zFrameExpected.except(newZFrame).isEmpty()); + assertTrue(newZFrame.except(zFrameExpected).isEmpty()); + } + + @Test + public void testStopWordColumnMissingFromStopWordFile() 
throws ZinggClientException, Exception {
+
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData3Original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class);
+
+ StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2);
+ ZFrame newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal);
+
+ assertTrue(zFrameExpected.except(newDataSet).isEmpty());
+ assertTrue(newDataSet.except(zFrameExpected).isEmpty());
+ }
+
+
+ @Test
+ public void testForOriginalDataAfterPostProcess() throws Exception {
+
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData4original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData4Expected(), PostStopWordProcess.class);
+
+ ZFrame newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal);
+
+ assertTrue(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(zFrameOriginal).isEmpty());
+ assertTrue(zFrameOriginal.except(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty());
+ }
+
+ @Test
+ public void testOriginalDataAfterPostProcessLinked() throws Exception {
+
+ ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData5Original(), PriorStopWordProcess.class);
+ ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData5Actual(), PostStopWordProcess.class);
+
+ ZFrame newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal);
+
+ assertTrue(newZFrame.select("field1", "field2", "field3").except(zFrameOriginal.select("field1", "field2", "field3")).isEmpty());
+ assertTrue(zFrameOriginal.select("field1", "field2", "field3").except(newZFrame.select("field1", "field2", "field3")).isEmpty());
+ }
+
+}
\ No newline at end of file

From df7bea9e788f647d5ecbf479ef021b86a7a5420c Mon Sep 17 00:00:00 2001
From: administrator
Date: Tue, 9 Jul 2024 18:37:11 +0530
Subject: [PATCH 179/219] added SparkStopWordRemoverUtility

---
 .../core/util/StopWordRemoverUtility.java | 13 +++
 ...WordsBase.java => TestSparkStopWords.java} | 11 +-
 .../core/preprocess/TestStopWordsBase.java | 108 ------------------
 ....java => SparkStopWordRemoverUtility.java} | 22 ++--
 4 files changed, 29 insertions(+), 125 deletions(-)
 create mode 100644 common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
 rename spark/core/src/test/java/zingg/common/core/preprocess/{TestSparkStopWordsBase.java => TestSparkStopWords.java} (82%)
 delete mode 100644 spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java
 rename spark/core/src/test/java/zingg/common/core/util/{SampleStopWordRemover.java => SparkStopWordRemoverUtility.java} (67%)
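StopWordRemoverUtility is the seam this patch introduces: the shared test base
in common/core asks the interface for its removers, and each engine module
supplies an implementation (here, the Spark one renamed from
SampleStopWordRemover). A hedged sketch of what another engine's implementation
might look like; the MyEngine* names are placeholders, not Zingg classes, and
generic parameters are elided just as the stripped signatures below elide them:

    public class MyEngineStopWordRemoverUtility implements StopWordRemoverUtility {
        @Override
        public List<StopWordsRemover> getStopWordRemovers(Context context, IArguments arguments)
                throws ZinggClientException {
            // one remover per FieldDefinition configuration the tests exercise
            List<StopWordsRemover> removers = new ArrayList<>();
            removers.add(new MyEngineStopWordsRemover(context, arguments));
            return removers;
        }
    }

diff --git a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
new file mode 100644
index 000000000..04871a606
--- /dev/null
+++ b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java
@@ -0,0 +1,13 @@
+package zingg.common.core.util;
+
+import zingg.common.client.IArguments;
+import zingg.common.client.ZinggClientException;
+import zingg.common.core.context.Context;
+import zingg.common.core.preprocess.StopWordsRemover;
+
+import java.util.List;
+
+public interface StopWordRemoverUtility {
+
+ List> getStopWordRemovers(Context context, IArguments arguments) throws 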
ZinggClientException; +} diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java similarity index 82% rename from spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java rename to spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java index a2efce588..0ba570d03 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWordsBase.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java @@ -10,17 +10,16 @@ import org.apache.spark.sql.types.DataType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import zingg.common.client.Arguments; import zingg.common.client.ZinggClientException; import zingg.common.client.util.WithSession; -import zingg.common.core.util.SampleStopWordRemover; +import zingg.common.core.util.SparkStopWordRemoverUtility; import zingg.spark.client.util.SparkDFObjectUtil; import zingg.spark.client.util.WithSparkSession; import zingg.spark.core.context.ZinggSparkContext; -public class TestSparkStopWordsBase extends TestStopWordsBase, Row, Column, DataType> { +public class TestSparkStopWords extends TestStopWordsBase, Row, Column, DataType> { - public static final Log LOG = LogFactory.getLog(TestSparkStopWordsBase.class); + public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class); public static JavaSparkContext ctx; public static SparkSession spark; public static ZinggSparkContext zsCTX; @@ -31,8 +30,8 @@ public static void setup() { setUpSpark(); } - public TestSparkStopWordsBase() throws ZinggClientException { - super(new SparkDFObjectUtil(withSession), SampleStopWordRemover.getStopWordRemovers(zsCTX, new Arguments()), zsCTX); + public TestSparkStopWords() throws ZinggClientException { + super(new SparkDFObjectUtil(withSession), new SparkStopWordRemoverUtility(), zsCTX); } protected static void setUpSpark() { diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java deleted file mode 100644 index b8210db03..000000000 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java +++ /dev/null @@ -1,108 +0,0 @@ -package zingg.common.core.preprocess; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; - -import zingg.common.client.Arguments; -import zingg.common.client.ZFrame; -import zingg.common.client.ZinggClientException; -import zingg.common.client.util.ColName; -import zingg.common.client.util.DFObjectUtil; -import zingg.common.core.context.Context; -import zingg.common.core.data.TestData; -import zingg.common.core.model.Statement; -import zingg.common.core.model.PostStopWordProcess; -import zingg.common.core.model.PriorStopWordProcess; -import zingg.common.core.util.StopWordRemoverUtility; - -public abstract class TestStopWordsBase { - - public static final Log LOG = LogFactory.getLog(TestStopWordsBase.class); - private final DFObjectUtil dfObjectUtil; - private final List> stopWordsRemovers; - private final Context context; - - - public TestStopWordsBase(DFObjectUtil dfObjectUtil, StopWordRemoverUtility stopWordRemoverUtility, - Context context) throws ZinggClientException { - 
this.dfObjectUtil = dfObjectUtil; - this.stopWordsRemovers = stopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); - this.context = context; - } - - @DisplayName ("Test Stop Words removal from Single column dataset") - @Test - public void testStopWordsSingleColumn() throws ZinggClientException, Exception { - - String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); - - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData1Original(), Statement.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData1Expected(), Statement.class); - - StopWordsRemover stopWordsRemover = stopWordsRemovers.get(0); - - stopWordsRemover.preprocessForStopWords(zFrameOriginal); - ZFrame newZFrame = stopWordsRemover.removeStopWordsFromDF(zFrameOriginal,"statement",stopWords); - - assertTrue(zFrameExpected.except(newZFrame).isEmpty()); - assertTrue(newZFrame.except(zFrameExpected).isEmpty()); - } - - @Test - public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { - - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData2Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData2Expected(), PriorStopWordProcess.class); - - StopWordsRemover stopWordsRemover = stopWordsRemovers.get(1); - ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - - assertTrue(zFrameExpected.except(newZFrame).isEmpty()); - assertTrue(newZFrame.except(zFrameExpected).isEmpty()); - } - - @Test - public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { - - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData3Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class); - - StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2); - ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - - assertTrue(zFrameExpected.except(newZFrame).isEmpty()); - assertTrue(newZFrame.except(zFrameExpected).isEmpty()); - } - - - @Test - public void testForOriginalDataAfterPostProcess() throws Exception { - - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData4original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData4Expected(), PostStopWordProcess.class); - - ZFrame newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal); - - assertTrue(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL).except(zFrameOriginal).isEmpty()); - assertTrue(zFrameOriginal.except(newZFrame.select(ColName.ID_COL, "field1", "field2", "field3", ColName.SOURCE_COL)).isEmpty()); - } - - @Test - public void testOriginalDataAfterPostProcessLinked() throws Exception { - - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData5Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData5Actual(), PostStopWordProcess.class); - - ZFrame newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal); - - assertTrue(newZFrame.select("field1", "field2", "field3").except(zFrameOriginal.select("field1", "field2", "field3")).isEmpty()); - assertTrue(zFrameOriginal.select("field1", "field2", "field3").except(newZFrame.select("field1", "field2", "field3")).isEmpty()); - } - -} \ No 
newline at end of file diff --git a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java similarity index 67% rename from spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java rename to spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java index fe839e593..a3bb2a52c 100644 --- a/spark/core/src/test/java/zingg/common/core/util/SampleStopWordRemover.java +++ b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java @@ -10,18 +10,18 @@ import zingg.common.client.IArguments; import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; +import zingg.common.core.context.Context; import zingg.common.core.preprocess.StopWordsRemover; -import zingg.spark.core.context.ZinggSparkContext; import zingg.spark.core.preprocess.SparkStopWordsRemover; import java.util.ArrayList; import java.util.List; import java.util.Objects; -public class SampleStopWordRemover { +public class SparkStopWordRemoverUtility implements StopWordRemoverUtility, Row, Column, DataType> { - public static List, Row, Column, DataType>> getStopWordRemovers(ZinggSparkContext zsCTX, - IArguments args) throws ZinggClientException { + @Override + public List, Row, Column, DataType>> getStopWordRemovers(Context, Row, Column, DataType> context, IArguments arguments) throws ZinggClientException { List, Row, Column, DataType>> sparkStopWordsRemovers = new ArrayList<>(); @@ -36,27 +36,27 @@ public static List, Row, Column, Dat fdList.add(eventFD); IArguments stmtArgs = new Arguments(); stmtArgs.setFieldDefinition(fdList); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX,stmtArgs)); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(context,stmtArgs)); //add second stopWordRemover String stopWordsFileName1 = Objects.requireNonNull( - SampleStopWordRemover.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); + StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); FieldDefinition fieldDefinition1 = new FieldDefinition(); fieldDefinition1.setStopWords(stopWordsFileName1); fieldDefinition1.setFieldName("field1"); List fieldDefinitionList1 = List.of(fieldDefinition1); - args.setFieldDefinition(fieldDefinitionList1); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args)); + arguments.setFieldDefinition(fieldDefinitionList1); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments)); //add third stopWordRemover String stopWordsFileName2 = Objects.requireNonNull( - SampleStopWordRemover.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); + StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); FieldDefinition fieldDefinition2 = new FieldDefinition(); fieldDefinition2.setStopWords(stopWordsFileName2); fieldDefinition2.setFieldName("field1"); List fieldDefinitionList2 = List.of(fieldDefinition2); - args.setFieldDefinition(fieldDefinitionList2); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(zsCTX, args)); + arguments.setFieldDefinition(fieldDefinitionList2); + sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments)); return sparkStopWordsRemovers; } From 18b6c40005db544ed95213d634c8bfcef95acbd0 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:43:17 +0530 Subject: [PATCH 180/219] added DFObjectUtil --- 
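Note for reviewers: patches 180 through 187 build up a small test-fixture stack. DFObjectUtil (below) converts a list of POJOs into a ZFrame, IWithSession/WithSession carry the underlying session, and the Spark-side pieces (SparkDFObjectUtil, SparkStructTypeFromPojoClass, RowsFromObjectList, PojoToArrayConverter) derive a schema from the POJO class by reflection and build Rows from its field values. A minimal sketch of the intended call pattern, assuming the names introduced in this series (Person and TestData are the sample POJO and data helpers that the later test patches use; the generic parameters are spelled out as session, dataframe, row and column types):

import java.util.List;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import zingg.common.client.ZFrame;
import zingg.common.client.data.TestData;
import zingg.common.client.model.Person;
import zingg.common.client.util.DFObjectUtil;
import zingg.common.client.util.IWithSession;
import zingg.common.client.util.WithSession;
import zingg.spark.client.util.SparkDFObjectUtil;

public class DFObjectUtilSketch {
    public static void main(String[] args) throws Exception {
        // local session, created the same way the test setups below do
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("ZinggJunit")
                .getOrCreate();

        // IWithSession decouples the utilities from a concrete session object
        IWithSession<SparkSession> withSession = new WithSession<>();
        withSession.setSession(spark);

        // schema is derived from the POJO class, rows from its field values
        DFObjectUtil<SparkSession, Dataset<Row>, Row, Column> dfObjectUtil =
                new SparkDFObjectUtil(withSession);

        // sample POJOs from the test helpers introduced later in the series
        List<Person> people = TestData.createSampleDataList();
        ZFrame<Dataset<Row>, Row, Column> zFrame =
                dfObjectUtil.getDFFromObjectList(people, Person.class);
        System.out.println(zFrame.count());
    }
}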
.../java/zingg/common/client/util/DFObjectUtil.java | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java diff --git a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java new file mode 100644 index 000000000..d277074b7 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java @@ -0,0 +1,11 @@ +package zingg.common.client.util; + +import java.util.List; + +import zingg.common.client.ZFrame; + +public abstract class DFObjectUtil { + + public abstract ZFrame getDFFromObjectList(List objList, Class objClass) throws Exception; + +} From ca3e1ca74370b3556c2476640e0baa2929ebc183 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:45:17 +0530 Subject: [PATCH 181/219] added IWithSession --- .../main/java/zingg/common/client/util/IWithSession.java | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/util/IWithSession.java diff --git a/common/client/src/main/java/zingg/common/client/util/IWithSession.java b/common/client/src/main/java/zingg/common/client/util/IWithSession.java new file mode 100644 index 000000000..470405c38 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/IWithSession.java @@ -0,0 +1,9 @@ +package zingg.common.client.util; + +public interface IWithSession { + + public void setSession(S s); + + public S getSession(); + +} \ No newline at end of file From 473603955c598557f37a563ada53ed24a1de61f8 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:46:00 +0530 Subject: [PATCH 182/219] added WithSession --- .../zingg/common/client/util/WithSession.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/util/WithSession.java diff --git a/common/client/src/main/java/zingg/common/client/util/WithSession.java b/common/client/src/main/java/zingg/common/client/util/WithSession.java new file mode 100644 index 000000000..e3d0612b9 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/WithSession.java @@ -0,0 +1,15 @@ +package zingg.common.client.util; + +public class WithSession implements IWithSession { + + S session; + @Override + public void setSession(S session) { + this.session = session; + } + + @Override + public S getSession() { + return session; + } +} From ea2d91bc21df1a01d922cb339c1e13ab0a0ca3f1 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:48:01 +0530 Subject: [PATCH 183/219] added SparkDFObjectUtil --- .../spark/client/util/SparkDFObjectUtil.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java new file mode 100644 index 000000000..840dc9944 --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java @@ -0,0 +1,36 @@ +package zingg.spark.client.util; + +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.StructType; + +import zingg.common.client.ZFrame; +import 
zingg.common.client.util.DFObjectUtil; +import zingg.common.client.util.IWithSession; +import zingg.spark.client.SparkFrame; + +public class SparkDFObjectUtil extends DFObjectUtil, Row, Column> { + + private final IWithSession withSparkSession; + + public SparkDFObjectUtil(IWithSession withSparkSession) { + this.withSparkSession = withSparkSession; + } + + @Override + public ZFrame, Row, Column> getDFFromObjectList(List objList, Class objClass) throws Exception { + if(objList == null || objClass == null) return null; + + SparkStructTypeFromPojoClass stpc = new SparkStructTypeFromPojoClass(); + + List rows = Arrays.asList(RowsFromObjectList.getRows(objList)); + StructType structType = stpc.getStructType(objClass); + return new SparkFrame(withSparkSession.getSession().createDataFrame(rows, structType)); + } + +} From aec8754da112a999ef2d3e9252c79d0a03e0c990 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:50:23 +0530 Subject: [PATCH 184/219] added StructTypeFromPojoClass --- .../client/util/StructTypeFromPojoClass.java | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java diff --git a/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java b/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java new file mode 100644 index 000000000..4b3de89bb --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/StructTypeFromPojoClass.java @@ -0,0 +1,34 @@ +package zingg.common.client.util; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.List; + +public abstract class StructTypeFromPojoClass { + + public abstract ST getStructType(Class objClass) throws Exception; + + public List getFields(Class objClass) { + List structFields = new ArrayList(); + Field[] fields = objClass.getDeclaredFields(); + + //add child class fields in struct + for (Field f : fields) { + structFields.add(getStructField(f)); + } + + //add parent class fields in struct + if (objClass.getSuperclass() != null) { + Field[] fieldsSuper = objClass.getSuperclass().getDeclaredFields(); + for (Field f : fieldsSuper) { + structFields.add(getStructField(f)); + } + } + return structFields; + } + + public abstract SF getStructField(Field field); + + public abstract T getSFType(Class t); + +} From 0c19b29873631e58754123fc44ea4b641f391112 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:51:37 +0530 Subject: [PATCH 185/219] added SparkStructTypeFromPojoClass --- .../util/SparkStructTypeFromPojoClass.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java b/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java new file mode 100644 index 000000000..3032907f4 --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkStructTypeFromPojoClass.java @@ -0,0 +1,48 @@ +package zingg.spark.client.util; + +import java.lang.reflect.Field; +import java.security.NoSuchAlgorithmException; +import java.util.List; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import 
zingg.common.client.util.StructTypeFromPojoClass; + +public class SparkStructTypeFromPojoClass extends StructTypeFromPojoClass { + + public StructType getStructType(Class objClass) + throws NoSuchAlgorithmException, IllegalArgumentException, IllegalAccessException { + List structFields = getFields(objClass); + return new StructType(structFields.toArray(new StructField[structFields.size()])); + } + + public StructField getStructField(Field field) { + field.setAccessible(true); + return new StructField(field.getName(), getSFType(field.getType()), true, Metadata.empty()); + } + + public DataType getSFType(Class t) { + if (t.getCanonicalName().contains("String")) { + return DataTypes.StringType; + } else if (t.getCanonicalName().contains("Integer")) { + return DataTypes.IntegerType; + } else if (t.getCanonicalName().contains("Long")) { + return DataTypes.LongType; + } else if (t.getCanonicalName().contains("Float")) { + return DataTypes.FloatType; + } else if (t.getCanonicalName().contains("Double")) { + return DataTypes.DoubleType; + } else if (t.getCanonicalName().contains("Date")) { + return DataTypes.DateType; + } else if (t.getCanonicalName().contains("Timestamp")) { + return DataTypes.TimestampType; + } + + return null; + } + +} \ No newline at end of file From 52aecb543260b9c778209d36b09f537ae17dbf22 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:52:52 +0530 Subject: [PATCH 186/219] added RowsFromObjectList --- .../spark/client/util/RowsFromObjectList.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java diff --git a/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java b/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java new file mode 100644 index 000000000..cb1a635be --- /dev/null +++ b/spark/client/src/main/java/zingg/spark/client/util/RowsFromObjectList.java @@ -0,0 +1,18 @@ +package zingg.spark.client.util; + +import java.util.List; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; + +import zingg.common.client.util.PojoToArrayConverter; + +public class RowsFromObjectList { + + public static Row[] getRows(List t) throws Exception{ + Row[] rows = new Row[t.size()]; + for (int i=0; i < t.size(); ++i){ + rows[i] = RowFactory.create(PojoToArrayConverter.getObjectArray(t.get(i))); + } + return rows; + } +} From 5be021fc43dc36cb9030f6f0486ad81db10d1206 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:54:32 +0530 Subject: [PATCH 187/219] added PojoToArrayConverter --- .../client/util/PojoToArrayConverter.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java diff --git a/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java new file mode 100644 index 000000000..a04e60b68 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/PojoToArrayConverter.java @@ -0,0 +1,40 @@ +package zingg.common.client.util; + +import java.lang.reflect.Field; + +public class PojoToArrayConverter { + + public static Object[] getObjectArray(Object object) throws IllegalAccessException { + Field[] fieldsInChildClass = object.getClass().getDeclaredFields(); + Field[] fieldsInParentClass = null; + + int fieldCountInChildClass = fieldsInChildClass.length; + int fieldCount = 
fieldCountInChildClass; + + if (object.getClass().getSuperclass() != null) { + fieldCount += object.getClass().getSuperclass().getDeclaredFields().length; + fieldsInParentClass = object.getClass().getSuperclass().getDeclaredFields(); + } + + //fieldCount = fieldCountChild + fieldCountParent + Object[] objArr = new Object[fieldCount]; + + int idx = 0; + + //iterate through child class fields + for (; idx < fieldCountInChildClass; idx++) { + Field field = fieldsInChildClass[idx]; + field.setAccessible(true); + objArr[idx] = field.get(object); + } + + //iterate through super class fields + for (; idx < fieldCount; idx++) { + Field field = fieldsInParentClass[idx - fieldCountInChildClass]; + field.setAccessible(true); + objArr[idx] = field.get(object); + } + + return objArr; + } +} From 66e9d712b28697722a960c5e8f2848acff19206f Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 13:56:48 +0530 Subject: [PATCH 188/219] used IWithSession in TestBlock --- .../java/zingg/common/core/block/TestSparkBlock.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 26f5a1652..92e3a42f9 100644 --- a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -11,9 +11,9 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import zingg.common.client.IArguments; +import zingg.common.client.util.IWithSession; import zingg.common.client.util.WithSession; import zingg.spark.client.util.SparkDFObjectUtil; -import zingg.spark.client.util.WithSparkSession; import zingg.spark.core.context.ZinggSparkContext; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkHashUtil; @@ -25,10 +25,10 @@ public class TestSparkBlock extends TestBlockBase, Ro public static JavaSparkContext ctx; public static ZinggSparkContext zsCTX; public static SparkSession spark; - public static WithSession withSession; + public static IWithSession iWithSession; public TestSparkBlock() { - super(new SparkDFObjectUtil(withSession), new SparkHashUtil(spark), new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil())); + super(new SparkDFObjectUtil(iWithSession), new SparkHashUtil(spark), new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil())); } @BeforeAll @@ -44,8 +44,8 @@ protected static void setUpSpark() { .appName("Zingg" + "Junit") .getOrCreate(); ctx = new JavaSparkContext(spark.sparkContext()); - withSession = new WithSparkSession(); - withSession.setSession(spark); + iWithSession = new WithSession<>(); + iWithSession.setSession(spark); zsCTX = new ZinggSparkContext(); zsCTX.init(spark); } catch (Throwable e) { From c0f3ee253f7ae4143b14a70122d3dba5cc5889e6 Mon Sep 17 00:00:00 2001 From: administrator Date: Wed, 10 Jul 2024 14:07:27 +0530 Subject: [PATCH 189/219] added IStopWordRemoverUtility --- .../java/zingg/common/core/block/TestBlockBase.java | 3 ++- .../common/core/preprocess/TestStopWordsBase.java | 12 ++++++------ ...overUtility.java => IStopWordRemoverUtility.java} | 2 +- .../common/core/preprocess/TestSparkStopWords.java | 10 +++++----- .../core/util/SparkStopWordRemoverUtility.java | 6 +++--- 5 files changed, 17 insertions(+), 16 deletions(-) rename common/core/src/test/java/zingg/common/core/util/{StopWordRemoverUtility.java => IStopWordRemoverUtility.java} (87%) diff --git 
a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 3d4f440af..821a6985a 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Objects; import org.junit.jupiter.api.Test; @@ -51,7 +52,7 @@ public void testTree() throws Throwable { } private IArguments getArguments() throws ZinggClientException { - String configFilePath = getClass().getResource("../../testFebrl/config.json").getFile(); + String configFilePath = Objects.requireNonNull(getClass().getResource("../../../../testFebrl/config.json")).getFile(); IArguments args = argumentsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java index 51ff098b9..0c4d496a9 100644 --- a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java +++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -19,7 +19,7 @@ import zingg.common.core.model.Statement; import zingg.common.core.model.PostStopWordProcess; import zingg.common.core.model.PriorStopWordProcess; -import zingg.common.core.util.StopWordRemoverUtility; +import zingg.common.core.util.IStopWordRemoverUtility; public abstract class TestStopWordsBase { @@ -29,10 +29,10 @@ public abstract class TestStopWordsBase { private final Context context; - public TestStopWordsBase(DFObjectUtil dfObjectUtil, StopWordRemoverUtility stopWordRemoverUtility, + public TestStopWordsBase(DFObjectUtil dfObjectUtil, IStopWordRemoverUtility IStopWordRemoverUtility, Context context) throws ZinggClientException { this.dfObjectUtil = dfObjectUtil; - this.stopWordsRemovers = stopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); + this.stopWordsRemovers = IStopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); this.context = context; } @@ -74,10 +74,10 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2); - ZFrame newDataSet = stopWordsRemover.preprocessForStopWords(zFrameOriginal); + ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); - assertTrue(zFrameExpected.except(newDataSet).isEmpty()); - assertTrue(newDataSet.except(zFrameExpected).isEmpty()); + assertTrue(zFrameExpected.except(newZFrame).isEmpty()); + assertTrue(newZFrame.except(zFrameExpected).isEmpty()); } diff --git a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java b/common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java similarity index 87% rename from common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java rename to common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java index 04871a606..d7e74f8f1 100644 --- a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java +++ b/common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java @@ -7,7 +7,7 @@ import java.util.List; -public interface StopWordRemoverUtility { +public interface IStopWordRemoverUtility { List> 
getStopWordRemovers(Context context, IArguments arguments) throws ZinggClientException; } diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java index 0ba570d03..4c9ebc025 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java @@ -11,10 +11,10 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import zingg.common.client.ZinggClientException; +import zingg.common.client.util.IWithSession; import zingg.common.client.util.WithSession; import zingg.common.core.util.SparkStopWordRemoverUtility; import zingg.spark.client.util.SparkDFObjectUtil; -import zingg.spark.client.util.WithSparkSession; import zingg.spark.core.context.ZinggSparkContext; public class TestSparkStopWords extends TestStopWordsBase, Row, Column, DataType> { @@ -23,7 +23,7 @@ public class TestSparkStopWords extends TestStopWordsBase withSession; + public static IWithSession iWithSession; @BeforeAll public static void setup() { @@ -31,7 +31,7 @@ public static void setup() { } public TestSparkStopWords() throws ZinggClientException { - super(new SparkDFObjectUtil(withSession), new SparkStopWordRemoverUtility(), zsCTX); + super(new SparkDFObjectUtil(iWithSession), new SparkStopWordRemoverUtility(), zsCTX); } protected static void setUpSpark() { @@ -42,8 +42,8 @@ protected static void setUpSpark() { .appName("Zingg" + "Junit") .getOrCreate(); ctx = new JavaSparkContext(spark.sparkContext()); - withSession = new WithSparkSession(); - withSession.setSession(spark); + iWithSession = new WithSession<>(); + iWithSession.setSession(spark); zsCTX = new ZinggSparkContext(); zsCTX.init(spark); } catch (Throwable e) { diff --git a/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java index a3bb2a52c..8057c636e 100644 --- a/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java +++ b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java @@ -18,7 +18,7 @@ import java.util.List; import java.util.Objects; -public class SparkStopWordRemoverUtility implements StopWordRemoverUtility, Row, Column, DataType> { +public class SparkStopWordRemoverUtility implements IStopWordRemoverUtility, Row, Column, DataType> { @Override public List, Row, Column, DataType>> getStopWordRemovers(Context, Row, Column, DataType> context, IArguments arguments) throws ZinggClientException { @@ -40,7 +40,7 @@ public List, Row, Column, DataType>> //add second stopWordRemover String stopWordsFileName1 = Objects.requireNonNull( - StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); + IStopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); FieldDefinition fieldDefinition1 = new FieldDefinition(); fieldDefinition1.setStopWords(stopWordsFileName1); fieldDefinition1.setFieldName("field1"); @@ -50,7 +50,7 @@ public List, Row, Column, DataType>> //add third stopWordRemover String stopWordsFileName2 = Objects.requireNonNull( - StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); + IStopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); FieldDefinition fieldDefinition2 = new 
FieldDefinition(); fieldDefinition2.setStopWords(stopWordsFileName2); fieldDefinition2.setFieldName("field1"); From 2585299d6d8ec3d0d0a4cd07c1d00a2919be282d Mon Sep 17 00:00:00 2001 From: administrator Date: Thu, 11 Jul 2024 14:12:14 +0530 Subject: [PATCH 190/219] fix for repeated registration of the same Spark function --- .../java/zingg/spark/core/model/VectorValueExtractor.java | 6 +++++- .../zingg/spark/core/preprocess/SparkStopWordsRemover.java | 6 +++++- .../java/zingg/spark/core/similarity/SparkTransformer.java | 6 +++++- .../src/main/java/zingg/spark/core/util/SparkHashUtil.java | 6 +++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java index 0227964e4..995f03cc0 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java +++ b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java @@ -26,7 +26,11 @@ public Double call(Vector v) { @Override public void register(SparkSession spark) { - spark.udf().register(uid, (UDF1) this, DataTypes.DoubleType); + + //only register udf if it is not registered already + if (!spark.catalog().functionExists(uid)) { + spark.udf().register(uid, (UDF1) this, DataTypes.DoubleType); + } } /*@Override diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index 3d4bfe9c4..512452af6 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -46,7 +46,11 @@ protected String registerUDF() { String udfName = removeStopWordsUDF.getName(); // register the UDF SparkSession zSession = getContext().getSession(); - zSession.udf().register(udfName, removeStopWordsUDF, DataTypes.StringType); + + //only register udf if it is not already registered + if (!zSession.catalog().functionExists(udfName)) { + zSession.udf().register(udfName, removeStopWordsUDF, DataTypes.StringType); + } return udfName; } diff --git a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java index dc6255ca2..259f615fe 100644 --- a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java +++ b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java @@ -26,7 +26,11 @@ public SparkTransformer(String inputCol, SparkSimFunction function, String outpu public void register(SparkSession spark) { - spark.udf().register(getUid(), (UDF2) function, DataTypes.DoubleType); + + //only register udf if it is not registered already + if (!spark.catalog().functionExists(getUid())) { + spark.udf().register(getUid(), (UDF2) function, DataTypes.DoubleType); + } } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java index fcaa48a77..9d786a16d 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java @@ -21,7 +21,11 @@ public SparkHashUtil(SparkSession spark) { public HashFunction, Row, Column,DataType> registerHashFunction(HashFnFromConf scriptArg) { HashFunction, Row, Column,DataType> fn = new SparkHashFunctionRegistry().getFunction(scriptArg.getName()); - getSessionObj().udf().register(fn.getName(), (UDF1) fn, fn.getReturnType()); + + //register udf only if it is not registered already + if (!getSessionObj().catalog().functionExists(fn.getName())) { + getSessionObj().udf().register(fn.getName(), (UDF1) fn, fn.getReturnType()); + } return fn; }
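The four guards added above make UDF registration idempotent within a session: Spark's catalog is consulted first, so repeated registration attempts under the same name within one run become no-ops. The next patch extracts the repeated check into a single SparkFnRegistrar helper. A self-contained sketch of the idiom using standard Spark APIs (the function name and identity UDF here are illustrative, not part of the patches):

import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

public class UdfGuardSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("udfGuard")
                .getOrCreate();

        UDF1<Integer, Integer> identity = i -> i;

        // first call registers the function in the session catalog
        registerOnce(spark, "identityInt", identity);
        // second call finds it in the catalog and does nothing
        registerOnce(spark, "identityInt", identity);
    }

    // the same check the call sites above perform, centralised by patch 191
    static void registerOnce(SparkSession spark, String name, UDF1<Integer, Integer> fn) {
        if (!spark.catalog().functionExists(name)) {
            spark.udf().register(name, fn, DataTypes.IntegerType);
        }
    }
}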
From db4bc97fc38f3478c3181062481eea000d7a510f Mon Sep 17 00:00:00 2001 From: administrator Date: Thu, 11 Jul 2024 17:48:08 +0530 Subject: [PATCH 191/219] added SparkFnRegistrar --- .../core/model/VectorValueExtractor.java | 6 ++--- .../preprocess/SparkStopWordsRemover.java | 6 ++--- .../core/similarity/SparkTransformer.java | 6 ++--- .../spark/core/util/SparkFnRegistrar.java | 25 +++++++++++++++++++ .../zingg/spark/core/util/SparkHashUtil.java | 5 +--- .../src/test/java/zingg/TestImageType.java | 19 +++++++------- 6 files changed, 42 insertions(+), 25 deletions(-) create mode 100644 spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java diff --git a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java index 995f03cc0..ca5251c5e 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java +++ b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java @@ -9,6 +9,7 @@ import org.apache.spark.sql.SparkSession; import zingg.spark.core.similarity.SparkBaseTransformer; +import zingg.spark.core.util.SparkFnRegistrar; public class VectorValueExtractor extends SparkBaseTransformer implements UDF1{ @@ -27,10 +28,7 @@ public Double call(Vector v) { @Override public void register(SparkSession spark) { - //only register udf if it is not registered already - if (!spark.catalog().functionExists(uid)) { - spark.udf().register(uid, (UDF1) this, DataTypes.DoubleType); - } + SparkFnRegistrar.registerSparkFunctionUDF1(spark, uid, this, DataTypes.DoubleType); } /*@Override diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index 512452af6..b02a0f4c1 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -19,6 +19,7 @@ import zingg.common.core.preprocess.StopWordsRemover; import zingg.spark.client.SparkFrame; import org.apache.spark.sql.SparkSession; +import zingg.spark.core.util.SparkFnRegistrar; public class SparkStopWordsRemover extends StopWordsRemover,Row,Column,DataType> implements Serializable { @@ -47,10 +48,7 @@ protected String registerUDF() { // register the UDF SparkSession zSession = getContext().getSession(); - //only register udf if it is not already registered - if (!zSession.catalog().functionExists(udfName)) { - zSession.udf().register(udfName, removeStopWordsUDF, DataTypes.StringType); - } + SparkFnRegistrar.registerSparkFunctionUDF2(zSession, udfName, removeStopWordsUDF, DataTypes.StringType); return udfName; } diff --git a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java index 259f615fe..f59bfcafa 100644 --- a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java +++ b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java @@ -6,6 +6,7 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.SparkSession; +import
zingg.spark.core.util.SparkFnRegistrar; public class SparkTransformer extends SparkBaseTransformer { @@ -27,10 +28,7 @@ public SparkTransformer(String inputCol, SparkSimFunction function, String outpu public void register(SparkSession spark) { - //only register udf if it is not registered already - if (!spark.catalog().functionExists(getUid())) { - spark.udf().register(getUid(), (UDF2) function, DataTypes.DoubleType); - } + SparkFnRegistrar.registerSparkFunctionUDF2(spark, getUid(), function, DataTypes.DoubleType); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java b/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java new file mode 100644 index 000000000..130a94054 --- /dev/null +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java @@ -0,0 +1,25 @@ +package zingg.spark.core.util; + +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.api.java.UDF1; +import org.apache.spark.sql.api.java.UDF2; +import org.apache.spark.sql.types.DataType; + +public class SparkFnRegistrar { + + public static void registerSparkFunctionUDF1(SparkSession sparkSession, String functionName, UDF1 udf1, DataType dataType) { + + //only register udf1 if it is not registered already + if (!sparkSession.catalog().functionExists(functionName)) { + sparkSession.udf().register(functionName, udf1, dataType); + } + } + + public static void registerSparkFunctionUDF2(SparkSession sparkSession, String functionName, UDF2 udf2, DataType dataType) { + + //only register udf2 if it is not registered already + if (!sparkSession.catalog().functionExists(functionName)) { + sparkSession.udf().register(functionName, udf2, dataType); + } + } +} diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java index 9d786a16d..bd9812188 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java @@ -22,10 +22,7 @@ public SparkHashUtil(SparkSession spark) { public HashFunction, Row, Column,DataType> registerHashFunction(HashFnFromConf scriptArg) { HashFunction, Row, Column,DataType> fn = new SparkHashFunctionRegistry().getFunction(scriptArg.getName()); - //register udf only if it is not registered already - if (!getSessionObj().catalog().functionExists(fn.getName())) { - getSessionObj().udf().register(fn.getName(), (UDF1) fn, fn.getReturnType()); - } + SparkFnRegistrar.registerSparkFunctionUDF1(getSessionObj(), fn.getName(), (UDF1) fn, fn.getReturnType()); return fn; } diff --git a/spark/core/src/test/java/zingg/TestImageType.java b/spark/core/src/test/java/zingg/TestImageType.java index d96dd7313..ff99b22c5 100644 --- a/spark/core/src/test/java/zingg/TestImageType.java +++ b/spark/core/src/test/java/zingg/TestImageType.java @@ -19,6 +19,7 @@ import zingg.common.core.similarity.function.ArrayDoubleSimilarityFunction; import zingg.spark.core.executor.ZinggSparkTester; +import zingg.spark.core.util.SparkFnRegistrar; public class TestImageType extends ZinggSparkTester{ @@ -90,7 +91,7 @@ public void testUDFArray() { df.printSchema(); // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleArr testUDFDoubleArr = new TestUDFDoubleArr(); - spark.udf().register("testUDFDoubleArr", testUDFDoubleArr, DataTypes.DoubleType); + SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleArr", testUDFDoubleArr, DataTypes.DoubleType); // call the UDF from select clause of DF df = 
df.withColumn("cosine", callUDF("testUDFDoubleArr", df.col("image_embedding"), df.col("image_embedding"))); @@ -116,8 +117,8 @@ public void testUDFList() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleList testUDFDoubleList = new TestUDFDoubleList(); - spark.udf().register("testUDFDoubleList", testUDFDoubleList, DataTypes.DoubleType); - + SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleList", testUDFDoubleList, DataTypes.DoubleType); + // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleList",df.col("image_embedding"),df.col("image_embedding"))); // see if error is reproduced @@ -142,8 +143,8 @@ public void testUDFSeq() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleSeq testUDFDoubleSeq = new TestUDFDoubleSeq(); - spark.udf().register("testUDFDoubleSeq", testUDFDoubleSeq, DataTypes.DoubleType); - + SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleSeq", testUDFDoubleSeq, DataTypes.DoubleType); + // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleSeq",df.col("image_embedding"),df.col("image_embedding"))); // see if error is reproduced @@ -168,8 +169,8 @@ public void testUDFWrappedArr() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleWrappedArr testUDFDoubleWrappedArr = new TestUDFDoubleWrappedArr(); - spark.udf().register("testUDFDoubleWrappedArr", testUDFDoubleWrappedArr, DataTypes.DoubleType); - + SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleWrappedArr", testUDFDoubleWrappedArr, DataTypes.DoubleType); + // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleWrappedArr",df.col("image_embedding"),df.col("image_embedding"))); // see if error is reproduced @@ -197,8 +198,8 @@ public void testUDFObj() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleObj testUDFDoubleObj = new TestUDFDoubleObj(); - spark.udf().register("testUDFDoubleObj", testUDFDoubleObj, DataTypes.DoubleType); - + SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleObj", testUDFDoubleObj, DataTypes.DoubleType); + // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleObj",df.col("image_embedding"),df.col("image_embedding"))); // see if error is reproduced From a9026b00eb3d0dd4bccdecfb655840b57afd7b03 Mon Sep 17 00:00:00 2001 From: administrator Date: Thu, 11 Jul 2024 18:09:45 +0530 Subject: [PATCH 192/219] method name changed to registerUDF1 and registerUDF2 --- .../zingg/spark/core/model/VectorValueExtractor.java | 2 +- .../spark/core/preprocess/SparkStopWordsRemover.java | 2 +- .../zingg/spark/core/similarity/SparkTransformer.java | 3 +-- .../java/zingg/spark/core/util/SparkFnRegistrar.java | 4 ++-- .../main/java/zingg/spark/core/util/SparkHashUtil.java | 2 +- spark/core/src/test/java/zingg/TestImageType.java | 10 +++++----- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java index ca5251c5e..e842386c5 100644 --- a/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java +++ b/spark/core/src/main/java/zingg/spark/core/model/VectorValueExtractor.java @@ -28,7 +28,7 @@ public Double call(Vector v) { @Override public void register(SparkSession spark) { - SparkFnRegistrar.registerSparkFunctionUDF1(spark, uid, this, DataTypes.DoubleType); + 
SparkFnRegistrar.registerUDF1(spark, uid, this, DataTypes.DoubleType); } /*@Override diff --git a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java index b02a0f4c1..860e66b7e 100644 --- a/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java +++ b/spark/core/src/main/java/zingg/spark/core/preprocess/SparkStopWordsRemover.java @@ -48,7 +48,7 @@ protected String registerUDF() { // register the UDF SparkSession zSession = getContext().getSession(); - SparkFnRegistrar.registerSparkFunctionUDF2(zSession, udfName, removeStopWordsUDF, DataTypes.StringType); + SparkFnRegistrar.registerUDF2(zSession, udfName, removeStopWordsUDF, DataTypes.StringType); return udfName; } diff --git a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java index f59bfcafa..f477067d6 100644 --- a/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java +++ b/spark/core/src/main/java/zingg/spark/core/similarity/SparkTransformer.java @@ -2,7 +2,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.spark.sql.api.java.UDF2; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.SparkSession; @@ -28,7 +27,7 @@ public SparkTransformer(String inputCol, SparkSimFunction function, String outpu public void register(SparkSession spark) { - SparkFnRegistrar.registerSparkFunctionUDF2(spark, getUid(), function, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, getUid(), function, DataTypes.DoubleType); } diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java b/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java index 130a94054..792d13b06 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkFnRegistrar.java @@ -7,7 +7,7 @@ public class SparkFnRegistrar { - public static void registerSparkFunctionUDF1(SparkSession sparkSession, String functionName, UDF1 udf1, DataType dataType) { + public static void registerUDF1(SparkSession sparkSession, String functionName, UDF1 udf1, DataType dataType) { //only register udf1 if it is not registered already if (!sparkSession.catalog().functionExists(functionName)) { @@ -15,7 +15,7 @@ public static void registerSparkFunctionUDF1(SparkSession sparkSession, String f } } - public static void registerSparkFunctionUDF2(SparkSession sparkSession, String functionName, UDF2 udf2, DataType dataType) { + public static void registerUDF2(SparkSession sparkSession, String functionName, UDF2 udf2, DataType dataType) { //only register udf2 if it is not registered already if (!sparkSession.catalog().functionExists(functionName)) { diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java index bd9812188..6096f9ecc 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkHashUtil.java @@ -22,7 +22,7 @@ public SparkHashUtil(SparkSession spark) { public HashFunction, Row, Column,DataType> registerHashFunction(HashFnFromConf scriptArg) { HashFunction, Row, Column,DataType> fn = new SparkHashFunctionRegistry().getFunction(scriptArg.getName()); - 
SparkFnRegistrar.registerSparkFunctionUDF1(getSessionObj(), fn.getName(), (UDF1) fn, fn.getReturnType()); + SparkFnRegistrar.registerUDF1(getSessionObj(), fn.getName(), (UDF1) fn, fn.getReturnType()); return fn; } diff --git a/spark/core/src/test/java/zingg/TestImageType.java b/spark/core/src/test/java/zingg/TestImageType.java index ff99b22c5..bb005a7b2 100644 --- a/spark/core/src/test/java/zingg/TestImageType.java +++ b/spark/core/src/test/java/zingg/TestImageType.java @@ -91,7 +91,7 @@ public void testUDFArray() { df.printSchema(); // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleArr testUDFDoubleArr = new TestUDFDoubleArr(); - SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleArr", testUDFDoubleArr, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, "testUDFDoubleArr", testUDFDoubleArr, DataTypes.DoubleType); // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleArr", df.col("image_embedding"), df.col("image_embedding"))); @@ -117,7 +117,7 @@ public void testUDFList() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleList testUDFDoubleList = new TestUDFDoubleList(); - SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleList", testUDFDoubleList, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, "testUDFDoubleList", testUDFDoubleList, DataTypes.DoubleType); // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleList",df.col("image_embedding"),df.col("image_embedding"))); @@ -143,7 +143,7 @@ public void testUDFSeq() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleSeq testUDFDoubleSeq = new TestUDFDoubleSeq(); - SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleSeq", testUDFDoubleSeq, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, "testUDFDoubleSeq", testUDFDoubleSeq, DataTypes.DoubleType); // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleSeq",df.col("image_embedding"),df.col("image_embedding"))); @@ -169,7 +169,7 @@ public void testUDFWrappedArr() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleWrappedArr testUDFDoubleWrappedArr = new TestUDFDoubleWrappedArr(); - SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleWrappedArr", testUDFDoubleWrappedArr, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, "testUDFDoubleWrappedArr", testUDFDoubleWrappedArr, DataTypes.DoubleType); // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleWrappedArr",df.col("image_embedding"),df.col("image_embedding"))); @@ -198,7 +198,7 @@ public void testUDFObj() { // register ArrayDoubleSimilarityFunction as a UDF TestUDFDoubleObj testUDFDoubleObj = new TestUDFDoubleObj(); - SparkFnRegistrar.registerSparkFunctionUDF2(spark, "testUDFDoubleObj", testUDFDoubleObj, DataTypes.DoubleType); + SparkFnRegistrar.registerUDF2(spark, "testUDFDoubleObj", testUDFDoubleObj, DataTypes.DoubleType); // call the UDF from select clause of DF df = df.withColumn("cosine", callUDF("testUDFDoubleObj",df.col("image_embedding"),df.col("image_embedding"))); From bed992461210708ea4c74d320ef01b663731c29f Mon Sep 17 00:00:00 2001 From: administrator Date: Sat, 13 Jul 2024 16:02:43 +0530 Subject: [PATCH 193/219] Renamed classes and import refactors --- .../common/client/util/DFObjectUtil.java | 6 ++++ .../zingg/common/client/TestZFrameBase.java | 18 +++++------ .../data/{Constant.java => TestData.java} | 2 +- 
.../common/core/block/TestBlockBase.java | 3 +- .../infra/util/PojoToArrayConverter.java | 32 ------------------- .../spark/client/util/SparkDFObjectUtil.java | 5 ++- .../spark/client/util/WithSparkSession.java | 19 ----------- .../java/zingg/client/TestSparkFrame.java | 10 +++--- 8 files changed, 25 insertions(+), 70 deletions(-) rename common/client/src/test/java/zingg/common/client/data/{Constant.java => TestData.java} (99%) delete mode 100644 common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java delete mode 100644 spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java diff --git a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java index d277074b7..c0ae8bd89 100644 --- a/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java +++ b/common/client/src/main/java/zingg/common/client/util/DFObjectUtil.java @@ -6,6 +6,12 @@ public abstract class DFObjectUtil { + protected final IWithSession iWithSession; + + protected DFObjectUtil(IWithSession iWithSession) { + this.iWithSession = iWithSession; + } + public abstract ZFrame getDFFromObjectList(List objList, Class objClass) throws Exception; } diff --git a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java index 7cdac1a9b..0735f95c3 100644 --- a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java +++ b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java @@ -22,15 +22,15 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -import static zingg.common.client.data.Constant.createEmptySampleData; -import static zingg.common.client.data.Constant.createSampleDataCluster; -import static zingg.common.client.data.Constant.createSampleDataClusterWithNull; -import static zingg.common.client.data.Constant.createSampleDataInput; -import static zingg.common.client.data.Constant.createSampleDataList; -import static zingg.common.client.data.Constant.createSampleDataListDistinct; -import static zingg.common.client.data.Constant.createSampleDataListWithDistinctSurnameAndPostcode; -import static zingg.common.client.data.Constant.createSampleDataListWithMixedDataType; -import static zingg.common.client.data.Constant.createSampleDataZScore; +import static zingg.common.client.data.TestData.createEmptySampleData; +import static zingg.common.client.data.TestData.createSampleDataCluster; +import static zingg.common.client.data.TestData.createSampleDataClusterWithNull; +import static zingg.common.client.data.TestData.createSampleDataInput; +import static zingg.common.client.data.TestData.createSampleDataList; +import static zingg.common.client.data.TestData.createSampleDataListDistinct; +import static zingg.common.client.data.TestData.createSampleDataListWithDistinctSurnameAndPostcode; +import static zingg.common.client.data.TestData.createSampleDataListWithMixedDataType; +import static zingg.common.client.data.TestData.createSampleDataZScore; public abstract class TestZFrameBase { diff --git a/common/client/src/test/java/zingg/common/client/data/Constant.java b/common/client/src/test/java/zingg/common/client/data/TestData.java similarity index 99% rename from common/client/src/test/java/zingg/common/client/data/Constant.java rename to common/client/src/test/java/zingg/common/client/data/TestData.java index e84f6c0fb..78a09dbb0 100644 --- 
a/common/client/src/test/java/zingg/common/client/data/Constant.java +++ b/common/client/src/test/java/zingg/common/client/data/TestData.java @@ -11,7 +11,7 @@ import java.util.ArrayList; import java.util.List; -public class Constant { +public class TestData { //sample data classes to be used for testing public static List createEmptySampleData() { diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 87d340171..66b2711a0 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -52,7 +52,8 @@ public void testTree() throws Throwable { } private IArguments getArguments() throws ZinggClientException { - + String configFilePath = Objects.requireNonNull(getClass().getResource("../../../../testFebrl/config.json")).getFile(); + IArguments args = argumentsUtil.createArgumentsFromJSON(configFilePath, "trainMatch"); List fdList = getFieldDefList(); diff --git a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java b/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java deleted file mode 100644 index a519cfe1f..000000000 --- a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java +++ /dev/null @@ -1,32 +0,0 @@ -package zingg.common.infra.util; - -import java.lang.reflect.*; -import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.List; - -public class PojoToArrayConverter { - - public static Object[] getObjectArray(T person) throws NoSuchAlgorithmException, IllegalArgumentException, IllegalAccessException { - - List values = new ArrayList(); - if (person.getClass().getSuperclass() != null) { - Field[] fieldsSuper = person.getClass().getSuperclass().getDeclaredFields(); - if (fieldsSuper != null){ - for (Field f: fieldsSuper) { - f.setAccessible(true); - values.add(f.get(person)); - } - - } - } - Field[] fields = person.getClass().getDeclaredFields(); - - for (Field field: fields) { - field.setAccessible(true); - values.add(field.get(person)); - } - - return values.toArray(new Object[values.size()]); - } -} diff --git a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java index 840dc9944..f5d185ae3 100644 --- a/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java +++ b/spark/client/src/main/java/zingg/spark/client/util/SparkDFObjectUtil.java @@ -16,10 +16,9 @@ public class SparkDFObjectUtil extends DFObjectUtil, Row, Column> { - private final IWithSession withSparkSession; public SparkDFObjectUtil(IWithSession withSparkSession) { - this.withSparkSession = withSparkSession; + super(withSparkSession); } @Override @@ -30,7 +29,7 @@ public ZFrame, Row, Column> getDFFromObjectList(List objList, Class List rows = Arrays.asList(RowsFromObjectList.getRows(objList)); StructType structType = stpc.getStructType(objClass); - return new SparkFrame(withSparkSession.getSession().createDataFrame(rows, structType)); + return new SparkFrame(iWithSession.getSession().createDataFrame(rows, structType)); } } diff --git a/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java b/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java deleted file mode 100644 index 327eca0d4..000000000 --- a/spark/client/src/main/java/zingg/spark/client/util/WithSparkSession.java 
+++ /dev/null @@ -1,19 +0,0 @@ -package zingg.spark.client.util; - -import org.apache.spark.sql.SparkSession; -import zingg.common.client.util.WithSession; - -public class WithSparkSession implements WithSession { - - private SparkSession sparkSession; - - @Override - public void setSession(SparkSession sparkSession) { - this.sparkSession = sparkSession; - } - - @Override - public SparkSession getSession() { - return this.sparkSession; - } -} diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java index f2cd6df2e..ab496a77e 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java @@ -20,10 +20,10 @@ import zingg.common.client.IArguments; import zingg.common.client.TestZFrameBase; import zingg.common.client.ZFrame; +import zingg.common.client.util.IWithSession; import zingg.common.client.util.WithSession; import zingg.spark.client.SparkFrame; import zingg.spark.client.util.SparkDFObjectUtil; -import zingg.spark.client.util.WithSparkSession; import java.util.Arrays; @@ -34,10 +34,10 @@ public class TestSparkFrame extends TestZFrameBase, R public static IArguments args; public static JavaSparkContext ctx; public static SparkSession spark; - public static WithSession withSession; + public static IWithSession iWithSession; public TestSparkFrame() { - super(new SparkDFObjectUtil(withSession)); + super(new SparkDFObjectUtil(iWithSession)); } @BeforeAll @@ -53,8 +53,8 @@ protected static void setUpSpark() { .appName("Zingg" + "Junit") .getOrCreate(); ctx = new JavaSparkContext(spark.sparkContext()); - withSession = new WithSparkSession(); - withSession.setSession(spark); + iWithSession = new WithSession<>(); + iWithSession.setSession(spark); } catch (Throwable e) { if (LOG.isDebugEnabled()) e.printStackTrace(); From ae8cfea3cee748b549a50a37c5674b2cec731011 Mon Sep 17 00:00:00 2001 From: administrator Date: Sat, 13 Jul 2024 16:19:46 +0530 Subject: [PATCH 194/219] renamed classes --- .../zingg/common/client/TestZFrameBase.java | 30 +++++----- .../zingg/common/client/data/TestData.java | 60 +++++++++---------- ...Source.java => InputWithZidAndSource.java} | 4 +- .../{ClusterPairOne.java => PairPartOne.java} | 4 +- .../{ClusterPairTwo.java => PairPartTwo.java} | 4 +- 5 files changed, 51 insertions(+), 51 deletions(-) rename common/client/src/test/java/zingg/common/client/model/{ClusterSource.java => InputWithZidAndSource.java} (66%) rename common/client/src/test/java/zingg/common/client/model/{ClusterPairOne.java => PairPartOne.java} (71%) rename common/client/src/test/java/zingg/common/client/model/{ClusterPairTwo.java => PairPartTwo.java} (71%) diff --git a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java index 0735f95c3..beb376aee 100644 --- a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java +++ b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java @@ -10,9 +10,9 @@ import zingg.common.client.model.Person; import zingg.common.client.model.PersonMixed; import zingg.common.client.model.ClusterZScore; -import zingg.common.client.model.ClusterSource; -import zingg.common.client.model.ClusterPairOne; -import zingg.common.client.model.ClusterPairTwo; +import zingg.common.client.model.InputWithZidAndSource; +import zingg.common.client.model.PairPartOne; +import zingg.common.client.model.PairPartTwo; import 
java.lang.reflect.Field; import java.util.ArrayList; @@ -422,10 +422,10 @@ public void testGroupByMinMax2() throws Exception { @Test public void testRightJoinMultiCol() throws Exception { - List sampleDataSetInput = createSampleDataInput(); //List - ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, ClusterSource.class); - List sampleDataSetCluster = createSampleDataCluster(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairOne.class); + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, InputWithZidAndSource.class); + List sampleDataSetCluster = createSampleDataCluster(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, PairPartOne.class); ZFrame joinedData = zFrameCluster.join(zFrameInput, ColName.ID_COL, ColName.SOURCE_COL, ZFrame.RIGHT_JOIN); assertEquals(10, joinedData.count()); @@ -433,18 +433,18 @@ public void testRightJoinMultiCol() throws Exception { @Test public void testFilterInCond() throws Exception { - List sampleDataSetInput = createSampleDataInput(); //List - ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, ClusterSource.class); - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); + List sampleDataSetInput = createSampleDataInput(); //List + ZFrame zFrameInput = dfObjectUtil.getDFFromObjectList(sampleDataSetInput, InputWithZidAndSource.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, PairPartTwo.class); ZFrame filteredData = zFrameInput.filterInCond(ColName.ID_COL, zFrameCluster, ColName.COL_PREFIX + ColName.ID_COL); assertEquals(5, filteredData.count()); } @Test public void testFilterNotNullCond() throws Exception { - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, PairPartTwo.class); ZFrame filteredData = zFrameCluster.filterNotNullCond(ColName.SOURCE_COL); assertEquals(3, filteredData.count()); @@ -452,8 +452,8 @@ public void testFilterNotNullCond() throws Exception { @Test public void testFilterNullCond() throws Exception { - List sampleDataSetCluster = createSampleDataClusterWithNull(); //List - ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, ClusterPairTwo.class); + List sampleDataSetCluster = createSampleDataClusterWithNull(); //List + ZFrame zFrameCluster = dfObjectUtil.getDFFromObjectList(sampleDataSetCluster, PairPartTwo.class); ZFrame filteredData = zFrameCluster.filterNullCond(ColName.SOURCE_COL); assertEquals(2, filteredData.count()); diff --git a/common/client/src/test/java/zingg/common/client/data/TestData.java b/common/client/src/test/java/zingg/common/client/data/TestData.java index 78a09dbb0..6c915617c 100644 --- a/common/client/src/test/java/zingg/common/client/data/TestData.java +++ b/common/client/src/test/java/zingg/common/client/data/TestData.java @@ -2,9 +2,9 @@ import zingg.common.client.model.Person; -import zingg.common.client.model.ClusterPairOne; -import zingg.common.client.model.ClusterPairTwo; 
-import zingg.common.client.model.ClusterSource; +import zingg.common.client.model.PairPartOne; +import zingg.common.client.model.PairPartTwo; +import zingg.common.client.model.InputWithZidAndSource; import zingg.common.client.model.PersonMixed; import zingg.common.client.model.ClusterZScore; @@ -105,43 +105,43 @@ public static List createSampleDataZScore() { return sample; } - public static List createSampleDataCluster() { + public static List createSampleDataCluster() { - List sample = new ArrayList<>(); - sample.add(new ClusterPairOne(1L, "100", 1001.0, "b")); - sample.add(new ClusterPairOne(2L, "100", 1002.0, "a")); - sample.add(new ClusterPairOne(3L, "100", 2001.0, "b")); - sample.add(new ClusterPairOne(4L, "900", 2002.0, "c")); - sample.add(new ClusterPairOne(5L, "111", 9002.0, "c")); + List sample = new ArrayList<>(); + sample.add(new PairPartOne(1L, "100", 1001.0, "b")); + sample.add(new PairPartOne(2L, "100", 1002.0, "a")); + sample.add(new PairPartOne(3L, "100", 2001.0, "b")); + sample.add(new PairPartOne(4L, "900", 2002.0, "c")); + sample.add(new PairPartOne(5L, "111", 9002.0, "c")); return sample; } - public static List createSampleDataClusterWithNull() { + public static List createSampleDataClusterWithNull() { - List sample = new ArrayList<>(); - sample.add(new ClusterPairTwo(1L, "100", 1001.0, "b")); - sample.add(new ClusterPairTwo(2L, "100", 1002.0, "a")); - sample.add(new ClusterPairTwo(3L, "100", 2001.0, null)); - sample.add(new ClusterPairTwo(4L, "900", 2002.0, "c")); - sample.add(new ClusterPairTwo(5L, "111", 9002.0, null)); + List sample = new ArrayList<>(); + sample.add(new PairPartTwo(1L, "100", 1001.0, "b")); + sample.add(new PairPartTwo(2L, "100", 1002.0, "a")); + sample.add(new PairPartTwo(3L, "100", 2001.0, null)); + sample.add(new PairPartTwo(4L, "900", 2002.0, "c")); + sample.add(new PairPartTwo(5L, "111", 9002.0, null)); return sample; } - public static List createSampleDataInput() { - - List sample = new ArrayList<>(); - sample.add(new ClusterSource(1L, "fname1", "b")); - sample.add(new ClusterSource(2L, "fname", "a")); - sample.add(new ClusterSource(3L, "fna", "b")); - sample.add((new ClusterSource(4L, "x", "c"))); - sample.add(new ClusterSource(5L, "y", "c")); - sample.add(new ClusterSource(11L, "new1", "b")); - sample.add(new ClusterSource(22L, "new12", "a")); - sample.add(new ClusterSource(33L, "new13", "b")); - sample.add(new ClusterSource(44L, "new14", "c")); - sample.add(new ClusterSource(55L, "new15", "c")); + public static List createSampleDataInput() { + + List sample = new ArrayList<>(); + sample.add(new InputWithZidAndSource(1L, "fname1", "b")); + sample.add(new InputWithZidAndSource(2L, "fname", "a")); + sample.add(new InputWithZidAndSource(3L, "fna", "b")); + sample.add((new InputWithZidAndSource(4L, "x", "c"))); + sample.add(new InputWithZidAndSource(5L, "y", "c")); + sample.add(new InputWithZidAndSource(11L, "new1", "b")); + sample.add(new InputWithZidAndSource(22L, "new12", "a")); + sample.add(new InputWithZidAndSource(33L, "new13", "b")); + sample.add(new InputWithZidAndSource(44L, "new14", "c")); + sample.add(new InputWithZidAndSource(55L, "new15", "c")); return sample; } diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterSource.java b/common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java similarity index 66% rename from common/client/src/test/java/zingg/common/client/model/ClusterSource.java rename to common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java index 
e21727258..78e6f4418 100644 --- a/common/client/src/test/java/zingg/common/client/model/ClusterSource.java +++ b/common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java @@ -1,11 +1,11 @@ package zingg.common.client.model; -public class ClusterSource { +public class InputWithZidAndSource { public final Long z_zid; public final String fname; public final String z_zsource; - public ClusterSource(Long z_zid, String fname, String z_zsource) { + public InputWithZidAndSource(Long z_zid, String fname, String z_zsource) { this.z_zid = z_zid; this.fname = fname; this.z_zsource = z_zsource; diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java b/common/client/src/test/java/zingg/common/client/model/PairPartOne.java similarity index 71% rename from common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java rename to common/client/src/test/java/zingg/common/client/model/PairPartOne.java index 6de7c7ff8..e26dd1fd7 100644 --- a/common/client/src/test/java/zingg/common/client/model/ClusterPairOne.java +++ b/common/client/src/test/java/zingg/common/client/model/PairPartOne.java @@ -1,12 +1,12 @@ package zingg.common.client.model; -public class ClusterPairOne { +public class PairPartOne { public final Long z_zid; public final String z_cluster; public final Double z_score; public final String z_zsource; - public ClusterPairOne(Long z_zid, String z_cluster, Double z_score, String z_zsource) { + public PairPartOne(Long z_zid, String z_cluster, Double z_score, String z_zsource) { this.z_zid = z_zid; this.z_cluster = z_cluster; this.z_score = z_score; diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java b/common/client/src/test/java/zingg/common/client/model/PairPartTwo.java similarity index 71% rename from common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java rename to common/client/src/test/java/zingg/common/client/model/PairPartTwo.java index 73935df20..6fb68c9c2 100644 --- a/common/client/src/test/java/zingg/common/client/model/ClusterPairTwo.java +++ b/common/client/src/test/java/zingg/common/client/model/PairPartTwo.java @@ -1,12 +1,12 @@ package zingg.common.client.model; -public class ClusterPairTwo { +public class PairPartTwo { public final Long z_z_zid; public final String z_cluster; public final Double z_score; public final String z_zsource; - public ClusterPairTwo(Long z_z_zid, String z_cluster, Double z_score, String z_zsource) { + public PairPartTwo(Long z_z_zid, String z_cluster, Double z_score, String z_zsource) { this.z_z_zid = z_z_zid; this.z_cluster = z_cluster; this.z_score = z_score; From 6abd12160a7b3d3c9b16250fafd193b7a0c23af6 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Wed, 3 Jul 2024 21:37:32 +0530 Subject: [PATCH 195/219] zframe repartition --- common/client/src/main/java/zingg/common/client/ZFrame.java | 1 + spark/client/src/main/java/zingg/spark/client/SparkFrame.java | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index 37de2b0db..b07a264c0 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -82,6 +82,7 @@ public interface ZFrame { public ZFrame repartition(int num); public ZFrame repartition(int num, C c); public ZFrame repartition(int num,scala.collection.Seq partitionExprs); + public ZFrame repartition(scala.collection.Seq 
partitionExprs); public ZFrame sample(boolean repartition, float num); diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index c2984868a..7add25352 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -223,6 +223,10 @@ public ZFrame, Row, Column> repartition(int num,scala.collection.Se return new SparkFrame(df.repartition(num, partitionExprs)); } + public ZFrame, Row, Column> repartition(scala.collection.Seq partitionExprs){ + return new SparkFrame(df.repartition(partitionExprs)); + } + @Override public Column gt(String c) { return gt(this,c); From 3de35da6fba7687af5a64197c2aa71923be9cfea Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Tue, 16 Jul 2024 13:00:14 +0530 Subject: [PATCH 196/219] csv reader and ifromcsv --- .../zingg/common/core/util/CsvReader.java | 32 +++++++++++++++++++ .../java/zingg/common/core/util/IFromCsv.java | 7 +++++ .../core}/util/PojoToArrayConverter.java | 2 +- 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 common/core/src/test/java/zingg/common/core/util/CsvReader.java create mode 100644 common/core/src/test/java/zingg/common/core/util/IFromCsv.java rename common/{infra/src/main/java/zingg/common/infra => core/src/test/java/zingg/common/core}/util/PojoToArrayConverter.java (96%) diff --git a/common/core/src/test/java/zingg/common/core/util/CsvReader.java b/common/core/src/test/java/zingg/common/core/util/CsvReader.java new file mode 100644 index 000000000..c700d6fe2 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/util/CsvReader.java @@ -0,0 +1,32 @@ +package zingg.common.core.util; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; +import java.util.Scanner; + +public class CsvReader { + protected List records; + IFromCsv creator; + + public CsvReader(IFromCsv creator){ + records = new ArrayList(); + this.creator = creator; + } + + public List getRecords(String file, boolean skipHeader) throws FileNotFoundException{ + int lineno = 0; + try (Scanner scanner = new Scanner(new File(file))) { + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + lineno++; + // honour skipHeader by dropping the first line of the file + if (skipHeader && lineno == 1) continue; + records.add(creator.fromCsv(line)); + } + } + return records; + } + +} diff --git a/common/core/src/test/java/zingg/common/core/util/IFromCsv.java b/common/core/src/test/java/zingg/common/core/util/IFromCsv.java new file mode 100644 index 000000000..574da836b --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/util/IFromCsv.java @@ -0,0 +1,7 @@ +package zingg.common.core.util; + +public interface IFromCsv { + + C fromCsv(String s); + +} diff --git a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java b/common/core/src/test/java/zingg/common/core/util/PojoToArrayConverter.java similarity index 96% rename from common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java rename to common/core/src/test/java/zingg/common/core/util/PojoToArrayConverter.java index a519cfe1f..e1a0ccf80 100644 --- a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java +++ b/common/core/src/test/java/zingg/common/core/util/PojoToArrayConverter.java @@ -1,4 +1,4 @@ -package zingg.common.infra.util; +package zingg.common.core.util; import java.lang.reflect.*; import java.security.NoSuchAlgorithmException; From 2416dc0e28c253f46c756a974af54aa09e42288e Mon Sep 17 00:00:00 2001 From: sania-16 Date: Thu, 18
Jul 2024 13:43:10 +0530 Subject: [PATCH 197/219] new class for toString() --- .../java/zingg/common/client/Arguments.java | 14 ++------- .../common/client/util/JsonStringify.java | 21 ++++++++++++ .../zingg/common/client/TestArguments.java | 31 +++++++++++++++++-- 3 files changed, 51 insertions(+), 15 deletions(-) create mode 100644 common/client/src/main/java/zingg/common/client/util/JsonStringify.java diff --git a/common/client/src/main/java/zingg/common/client/Arguments.java b/common/client/src/main/java/zingg/common/client/Arguments.java index 3f396f090..4ec0bda44 100644 --- a/common/client/src/main/java/zingg/common/client/Arguments.java +++ b/common/client/src/main/java/zingg/common/client/Arguments.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import zingg.common.client.pipe.Pipe; +import zingg.common.client.util.JsonStringify; /** @@ -309,18 +310,7 @@ public void checkNullBlankEmpty(Pipe[] field, String fieldName) throws ZinggClie @Override public String toString() { - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, - true); - //mapper.configure(JsonParser.Feature.FAIL_ON_EMPTY_BEANS, true) - try { - StringWriter writer = new StringWriter(); - return mapper.writeValueAsString(this); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - return null; - } + return JsonStringify.toString(this); } /** diff --git a/common/client/src/main/java/zingg/common/client/util/JsonStringify.java b/common/client/src/main/java/zingg/common/client/util/JsonStringify.java new file mode 100644 index 000000000..848155e83 --- /dev/null +++ b/common/client/src/main/java/zingg/common/client/util/JsonStringify.java @@ -0,0 +1,21 @@ +package zingg.common.client.util; + +import java.io.IOException; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class JsonStringify { + public static String toString(Object o){ + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, true); + //mapper.configure(JsonParser.Feature.FAIL_ON_EMPTY_BEANS, true) + try { + return mapper.writeValueAsString(o); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + +} diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index 290bac5d7..0a75b4d2d 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -244,8 +244,33 @@ public void testMatchTypeWrong() { } - - - + + @Test + public void testJsonStringify(){ + IArguments argsFromJsonFile; + try{ + //Converting to JSON using toString() + argsFromJsonFile = argsUtil.createArgumentsFromJSON(getClass().getResource("../../../testArguments/configWithMultipleMatchTypesUnsupported.json").getFile(), "test"); + String strFromJsonFile = argsFromJsonFile.toString(); + + IArguments argsFullCycle = argsUtil.createArgumentsFromJSONString(strFromJsonFile, ""); + + assertEquals(argsFullCycle.getFieldDefinition().get(0), argsFromJsonFile.getFieldDefinition().get(0)); + assertEquals(argsFullCycle.getFieldDefinition().get(2),
argsFromJsonFile.getFieldDefinition().get(2)); + assertEquals(argsFullCycle.getModelId(), argsFromJsonFile.getModelId()); + assertEquals(argsFullCycle.getZinggModelDir(), argsFromJsonFile.getZinggModelDir()); + assertEquals(argsFullCycle.getNumPartitions(), argsFromJsonFile.getNumPartitions()); + assertEquals(argsFullCycle.getLabelDataSampleSize() ,argsFromJsonFile.getLabelDataSampleSize()); + assertEquals(argsFullCycle.getTrainingSamples(),argsFromJsonFile.getTrainingSamples()); + assertEquals(argsFullCycle.getOutput(),argsFromJsonFile.getOutput()); + assertEquals(argsFullCycle.getData(),argsFromJsonFile.getData()); + assertEquals(argsFullCycle.getZinggDir(),argsFromJsonFile.getZinggDir()); + assertEquals(argsFullCycle.getJobId(),argsFromJsonFile.getJobId()); + + } catch (Exception | ZinggClientException e) { + e.printStackTrace(); + } + + } } From a077c3ba55d0ff6ee89007478c04b603df107ff3 Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Mon, 22 Jul 2024 11:04:54 +0530 Subject: [PATCH 198/219] spark caching and perf improvement --- common/client/src/main/java/zingg/common/client/Samples.java | 4 ++++ .../src/main/java/zingg/common/core/executor/Matcher.java | 2 +- .../main/java/zingg/common/core/pairs/SelfPairBuilder.java | 2 +- .../src/main/java/zingg/spark/core/util/SparkGraphUtil.java | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/Samples.java b/common/client/src/main/java/zingg/common/client/Samples.java index 1a74c3874..c93fa249a 100644 --- a/common/client/src/main/java/zingg/common/client/Samples.java +++ b/common/client/src/main/java/zingg/common/client/Samples.java @@ -1,3 +1,7 @@ + + + + package zingg.common.client; import java.io.Serializable; diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java index 2e976eae9..88a16cd10 100644 --- a/common/core/src/main/java/zingg/common/core/executor/Matcher.java +++ b/common/core/src/main/java/zingg/common/core/executor/Matcher.java @@ -49,7 +49,7 @@ public ZFrame getBlocked( ZFrame testData) throws Exception, Zin LOG.debug("Blocking model file location is " + args.getBlockFile()); Tree> tree = getBlockingTreeUtil().readBlockingTree(args); ZFrame blocked = getBlockingTreeUtil().getBlockHashes(testData, tree); - ZFrame blocked1 = blocked.repartition(args.getNumPartitions(), blocked.col(ColName.HASH_COL)); //.cache(); + ZFrame blocked1 = blocked.repartition(args.getNumPartitions(), blocked.col(ColName.HASH_COL)).cache(); return blocked1; } diff --git a/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java index 4d0fff71d..2e9e261db 100644 --- a/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java +++ b/common/core/src/main/java/zingg/common/core/pairs/SelfPairBuilder.java @@ -27,7 +27,7 @@ public ZFrame getPairs(ZFrameblocked, ZFramebAll) throws */ //joinH.show(); joinH = joinH.filter(joinH.gt(ColName.ID_COL)); - LOG.warn("Num comparisons " + joinH.count()); + if (LOG.isDebugEnabled()) LOG.debug("Num comparisons " + joinH.count()); joinH = joinH.repartition(args.getNumPartitions(), joinH.col(ColName.ID_COL)); bAll = bAll.repartition(args.getNumPartitions(), bAll.col(ColName.ID_COL)); joinH = joinH.joinOnCol(bAll, ColName.ID_COL); diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java 
index 44a8ac240..8a885c751 100644 --- a/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java +++ b/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java @@ -20,6 +20,7 @@ public ZFrame, Row, Column> buildGraph(ZFrame, Row, Co // we need to transform the input here by using stop words //rename id field which is a common field in data to another field as it //clashes with graphframes :-( + vOrig = vOrig.cache(); Dataset vertices = vOrig.df(); Dataset edges = ed.df(); vertices = vertices.withColumnRenamed(ColName.ID_EXTERNAL_ORIG_COL, ColName.ID_EXTERNAL_COL); From fb4115df83e4e3de507ad4d7d97d8ed06c5a904d Mon Sep 17 00:00:00 2001 From: sania-16 Date: Tue, 23 Jul 2024 11:23:02 +0530 Subject: [PATCH 199/219] match type issue #839 --- examples/febrl120k/config120k.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/febrl120k/config120k.json b/examples/febrl120k/config120k.json index 235738655..e16f979d7 100644 --- a/examples/febrl120k/config120k.json +++ b/examples/febrl120k/config120k.json @@ -2,7 +2,7 @@ "fieldDefinition":[ { "fieldName" : "fname", - "matchType" : "email", + "matchType" : "fuzzy", "fields" : "fname", "dataType": "string" }, From 561c35d8251ced5ab32e8aa0a72b51bbef5da44f Mon Sep 17 00:00:00 2001 From: sania-16 Date: Tue, 30 Jul 2024 12:06:46 +0530 Subject: [PATCH 200/219] defining function getClientOptions for EClientOptions --- common/client/src/main/java/zingg/common/client/Client.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 05fb60d64..88a85ef78 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -184,6 +184,10 @@ public void printAnalyticsBanner(boolean collectMetrics) { } public abstract Client getClient(IArguments args, ClientOptions options) throws ZinggClientException; + + public ClientOptions getClientOptions(String ... args){ + return new ClientOptions(args); + } public void mainMethod(String... args) { printBanner(); @@ -192,7 +196,7 @@ public void mainMethod(String... args) { try { for (String a: args) LOG.debug("args " + a); - options = new ClientOptions(args); + options = getClientOptions(args); setOptions(options); if (options.has(options.HELP) || options.has(options.HELP1) || options.get(ClientOptions.PHASE) == null) { From 0f18bdefcf6e4c3d1b9fd63874c9f278481b2f06 Mon Sep 17 00:00:00 2001 From: sania-16 Date: Wed, 31 Jul 2024 12:48:49 +0530 Subject: [PATCH 201/219] code refactoring of ClientOptions --- .../zingg/common/client/ClientOptions.java | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ClientOptions.java b/common/client/src/main/java/zingg/common/client/ClientOptions.java index e35d4a541..430cbe47b 100644 --- a/common/client/src/main/java/zingg/common/client/ClientOptions.java +++ b/common/client/src/main/java/zingg/common/client/ClientOptions.java @@ -43,7 +43,7 @@ public class ClientOptions { protected String[] commandLineArgs; - protected static Map optionMaster = new HashMap(); + protected Map optionMaster = new HashMap(); /* * String optionName; //String alias; boolean isExit; boolean isMandatory; */ - static { //This is the canonical list of Zingg options.
+ protected void loadOptions() { //This is the canonical list of Zingg options. optionMaster.put(CONF, new Option(CONF, true, "JSON configuration with data input output locations and field definitions", false, true)); optionMaster.put(PHASE, new Option(PHASE, true, Util.join(ZinggOptions.getAllZinggOptions(), "|"), false, true, ZinggOptions.getAllZinggOptions())); optionMaster.put(LICENSE, new Option(LICENSE, true, "location of license file", false, true)); @@ -76,13 +76,19 @@ public class ClientOptions { } protected Map options = new HashMap (); + + public ClientOptions(){ + loadOptions(); + } public ClientOptions(String... args) { + this(); this.commandLineArgs = args; parse(Arrays.asList(args)); } public ClientOptions(List args) { + this(); this.commandLineArgs = args.toArray(new String[args.size()]); parse(args); } @@ -90,8 +96,15 @@ public ClientOptions(List args) { public String[] getCommandLineArgs() { return this.commandLineArgs; } + + public Map getOptionMaster(){ + return optionMaster; + } - + public void setOptionMaster(Map optionMaster) { + this.optionMaster = optionMaster; + } + /** * Parse a list of Zingg command line options. *

@@ -250,12 +263,13 @@ public final static String getHelp() { s.append("options\n"); int maxlo = 0; - for (Option o: optionMaster.values()){ + ClientOptions co = new ClientOptions(); + for (Option o: co.optionMaster.values()){ maxlo=Math.max(maxlo,o.optionName.length()); } int maxld = 0; - for (Option o: optionMaster.values()){ + for (Option o: co.optionMaster.values()){ maxld=Math.max(maxld,o.desc.length()); } @@ -263,7 +277,7 @@ public final static String getHelp() { formatBuilder.append("\t").append("%-").append(maxlo + 5).append("s").append(": ").append("%-").append(maxld + 5).append("s").append("\n"); String format = formatBuilder.toString(); - for (Option o: optionMaster.values()) { + for (Option o: co.optionMaster.values()) { s.append(String.format(format,o.optionName, o.desc)); } return s.toString(); From b36721ca73797a0a502f8c523dae090c7eb66565 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:06:45 +0530 Subject: [PATCH 202/219] Update amazonS3.md doc changes for AWS S3 --- docs/dataSourcesAndSinks/amazonS3.md | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index 5ee47236f..eea9261ef 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -1,25 +1,30 @@ # S3 -1. Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg +Zingg can use AWS S3 as a source and sink -2. Create aws access key and export via env vars (ensure that the user with below keys has read/write access to above): +## Steps to run zingg on S3 -export AWS_ACCESS_KEY_ID= -export AWS_SECRET_ACCESS_KEY= +* Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg -(if mfa is enabled AWS_SESSION_TOKEN env var would also be needed ) +* Create aws access key and export via env vars (ensure that the user with below keys has read/write access to above) + export AWS_ACCESS_KEY_ID= + export AWS_SECRET_ACCESS_KEY= + (if mfa is enabled AWS_SESSION_TOKEN env var would also be needed ) -3. Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven +* Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven -4. Set above in zingg.conf : -spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar +* Set above in zingg.conf + spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar -5. Run using: +* Run using below commands +```bash ./scripts/zingg.sh --phase findTrainingData --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg ./scripts/zingg.sh --phase label --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg ./scripts/zingg.sh --phase train --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg ./scripts/zingg.sh --phase match --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg + ``` -6. Models etc. would get saved in -Amazon S3 > Buckets > zingg28032023 >zingg > 100 + ## Model location + Models etc. 
would get saved in + **Amazon S3 > Buckets > zingg28032023 >zingg > 100** From 513581c37fb0adc710a21e34bc9c4fb8e2513940 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:08:06 +0530 Subject: [PATCH 203/219] Update amazonS3.md --- docs/dataSourcesAndSinks/amazonS3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index eea9261ef..16f0db960 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -27,4 +27,4 @@ Zingg can use AWS S3 as a source and sink ## Model location Models etc. would get saved in - **Amazon S3 > Buckets > zingg28032023 >zingg > 100** + *Amazon S3 > Buckets > zingg28032023 >zingg > 100* From fe0b5910f9f9e193c417d915264ef7d051abcef6 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:08:35 +0530 Subject: [PATCH 204/219] Update amazonS3.md --- docs/dataSourcesAndSinks/amazonS3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index 16f0db960..01fc7e23b 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -27,4 +27,4 @@ Zingg can use AWS S3 as a source and sink ## Model location Models etc. would get saved in - *Amazon S3 > Buckets > zingg28032023 >zingg > 100* + **"Amazon S3 > Buckets > zingg28032023 >zingg > 100"** From 2e9a3e595c4f527d84df0080183123d3bb93884c Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:10:02 +0530 Subject: [PATCH 205/219] Update amazonS3.md --- docs/dataSourcesAndSinks/amazonS3.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index 01fc7e23b..5f80462de 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -26,5 +26,5 @@ Zingg can use AWS S3 as a source and sink ``` ## Model location - Models etc. would get saved in - **"Amazon S3 > Buckets > zingg28032023 >zingg > 100"** +Models etc. would get saved in +**Amazon S3 > Buckets > zingg28032023 >zingg > 100** From efe7cd4976af9961dacefaaf5d16bfb8d109eb8a Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:10:40 +0530 Subject: [PATCH 206/219] Update amazonS3.md final changes --- docs/dataSourcesAndSinks/amazonS3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index 5f80462de..d147422ae 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -27,4 +27,4 @@ Zingg can use AWS S3 as a source and sink ## Model location Models etc. would get saved in -**Amazon S3 > Buckets > zingg28032023 >zingg > 100** +Amazon S3 > Buckets > zingg28032023 >zingg > 100 From ed4577673542319b0cdd037c55792c7afece2ef7 Mon Sep 17 00:00:00 2001 From: Nitish1814 Date: Fri, 9 Aug 2024 03:10:55 +0530 Subject: [PATCH 207/219] Update amazonS3.md --- docs/dataSourcesAndSinks/amazonS3.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/dataSourcesAndSinks/amazonS3.md b/docs/dataSourcesAndSinks/amazonS3.md index d147422ae..7ccf1f728 100644 --- a/docs/dataSourcesAndSinks/amazonS3.md +++ b/docs/dataSourcesAndSinks/amazonS3.md @@ -26,5 +26,5 @@ Zingg can use AWS S3 as a source and sink ``` ## Model location -Models etc. would get saved in -Amazon S3 > Buckets > zingg28032023 >zingg > 100 + Models etc. 
would get saved in + Amazon S3 > Buckets > zingg28032023 >zingg > 100 From 192f0059f1e091ebd65ac66a99f469e16a9e681a Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 9 Aug 2024 16:02:35 +0530 Subject: [PATCH 208/219] test that model got created in train phase --- .../core/executor/TestExecutorsGeneric.java | 4 +- .../common/core/executor/TrainerTester.java | 14 +++---- .../core/executor/SparkTrainerTester.java | 37 +++++++++++++++++++ .../core/executor/TestSparkExecutors.java | 7 ++++ 4 files changed, 54 insertions(+), 8 deletions(-) create mode 100644 spark/core/src/test/java/zingg/spark/core/executor/SparkTrainerTester.java diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java index c9c3d53dd..6de3c9813 100644 --- a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java +++ b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -72,7 +72,7 @@ public void testExecutors() throws ZinggClientException { Trainer trainer = getTrainer(); trainer.init(args,session); - TrainerTester tt = new TrainerTester(trainer); + TrainerTester tt = getTrainerTester(trainer); executorTesterList.add(tt); Matcher matcher = getMatcher(); @@ -82,6 +82,8 @@ public void testExecutors() throws ZinggClientException { testExecutors(executorTesterList); } + + protected abstract TrainerTester getTrainerTester(Trainer trainer); public void testExecutors(List> executorTesterList) throws ZinggClientException { diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java index 76d15e708..b5f0cbbd9 100644 --- a/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java +++ b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java @@ -3,17 +3,17 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -public class TrainerTester extends ExecutorTester { +import zingg.common.client.IArguments; + +public abstract class TrainerTester extends ExecutorTester { public static final Log LOG = LogFactory.getLog(TrainerTester.class); - public TrainerTester(Trainer executor) { + protected IArguments args; + + public TrainerTester(Trainer executor,IArguments args) { super(executor); - } - - @Override - public void validateResults() { - LOG.info("train successful"); + this.args = args; } } diff --git a/spark/core/src/test/java/zingg/spark/core/executor/SparkTrainerTester.java b/spark/core/src/test/java/zingg/spark/core/executor/SparkTrainerTester.java new file mode 100644 index 000000000..db1e45f09 --- /dev/null +++ b/spark/core/src/test/java/zingg/spark/core/executor/SparkTrainerTester.java @@ -0,0 +1,37 @@ +package zingg.spark.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.File; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; + +import zingg.common.client.IArguments; +import zingg.common.client.ZinggClientException; +import zingg.common.core.executor.Trainer; +import zingg.common.core.executor.TrainerTester; + +public class SparkTrainerTester extends TrainerTester,Row,Column,DataType> { + + public static final Log LOG = 
LogFactory.getLog(SparkTrainerTester.class); + + public SparkTrainerTester(Trainer,Row,Column,DataType> executor,IArguments args) { + super(executor,args); + } + + @Override + public void validateResults() throws ZinggClientException { + // check that model is created + LOG.info("Zingg Model Dir : "+args.getZinggModelDir()); + + File modelDir = new File(args.getZinggModelDir()); + assertTrue(modelDir.exists(),"check if model has been created"); + } + +} diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java index 8128fdc1a..e948393d9 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -15,6 +15,8 @@ import zingg.common.client.ZinggClientException; import zingg.common.core.executor.Labeller; import zingg.common.core.executor.TestExecutorsGeneric; +import zingg.common.core.executor.Trainer; +import zingg.common.core.executor.TrainerTester; import zingg.spark.core.context.ZinggSparkContext; public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { @@ -75,6 +77,11 @@ public String setupArgs() throws ZinggClientException, IOException { return configFile; } + @Override + protected SparkTrainerTester getTrainerTester(Trainer,Row,Column,DataType> trainer) { + return new SparkTrainerTester(trainer,args); + } + @Override @AfterEach public void tearDown() { From 9d4cb4ccc866c8ed2c33ce7ccfe9512cc3d9411b Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 12 Aug 2024 01:12:23 +0530 Subject: [PATCH 209/219] test changes in Block related classes --- common/client/pom.xml | 16 ---- .../common/core/block/TestBlockBase.java | 4 +- .../java/zingg/common/core/data/TestData.java | 86 +++++++++---------- .../java/zingg/common/core/model/Event.java | 11 +-- .../zingg/common/core/model/EventBase.java | 15 ++++ .../zingg/common/core/model/EventCluster.java | 23 ----- .../zingg/common/core/model/EventPair.java | 16 ++++ pom.xml | 12 +++ .../common/core/block/TestSparkBlock.java | 16 ++-- .../spark/core/executor/ZinggSparkTester.java | 12 --- 10 files changed, 98 insertions(+), 113 deletions(-) create mode 100644 common/core/src/test/java/zingg/common/core/model/EventBase.java delete mode 100644 common/core/src/test/java/zingg/common/core/model/EventCluster.java create mode 100644 common/core/src/test/java/zingg/common/core/model/EventPair.java diff --git a/common/client/pom.xml b/common/client/pom.xml index 8e55122e5..c67339949 100644 --- a/common/client/pom.xml +++ b/common/client/pom.xml @@ -14,20 +14,4 @@ 1.4 - - - - org.apache.maven.plugins - maven-jar-plugin - ${maven-jar-plugin.version} - - - - test-jar - - - - - - diff --git a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 66b2711a0..57f85c55f 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -18,7 +18,7 @@ import zingg.common.core.util.BlockingTreeUtil; import zingg.common.core.util.HashUtil; import zingg.common.core.model.Event; -import zingg.common.core.model.EventCluster; +import zingg.common.core.model.EventPair; import zingg.common.core.data.TestData; public abstract class TestBlockBase { @@ -39,7 +39,7 @@ public void testTree() throws Throwable { // form tree ZFrame zFrameEvent = 
dfObjectUtil.getDFFromObjectList(TestData.createSampleEventData(), Event.class); - ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(TestData.createSampleClusterEventData(), EventCluster.class); + ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(TestData.createSampleClusterEventData(), EventPair.class); IArguments args = getArguments(); Tree> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1, diff --git a/common/core/src/test/java/zingg/common/core/data/TestData.java b/common/core/src/test/java/zingg/common/core/data/TestData.java index 7418ac21c..9867659cd 100644 --- a/common/core/src/test/java/zingg/common/core/data/TestData.java +++ b/common/core/src/test/java/zingg/common/core/data/TestData.java @@ -1,7 +1,7 @@ package zingg.common.core.data; import zingg.common.core.model.Event; -import zingg.common.core.model.EventCluster; +import zingg.common.core.model.EventPair; import zingg.common.core.model.Statement; import zingg.common.core.model.PostStopWordProcess; import zingg.common.core.model.PriorStopWordProcess; @@ -88,50 +88,50 @@ public static List createSampleEventData() { return sample; } - public static List createSampleClusterEventData() { + public static List createSampleClusterEventData() { int row_id = 1; - List sample = new ArrayList<>(); - sample.add(new EventCluster(row_id++, 1942, "quit Nation", "Mahatma",1942, "quit Nation", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma",1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, 
"JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); - sample.add(new EventCluster(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); - sample.add(new EventCluster(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); - sample.add(new EventCluster(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); - sample.add(new EventCluster(row_id, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + List sample = new ArrayList<>(); + sample.add(new EventPair(row_id++, 1942, "quit Nation", "Mahatma",1942, "quit Nation", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma",1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new 
EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); + sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); + sample.add(new EventPair(row_id++, 1942, "quit ", "Mahatm", 1942, "quit ", "Mahatm", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Ntn", "Mahama", 1942, "quit Ntn", "Mahama", 1L)); + sample.add(new EventPair(row_id++, 1942, "quit Natin", "Mahaatma", 1942, "quit Natin", "Mahaatma", 1L)); + sample.add(new EventPair(row_id, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); return sample; } diff --git a/common/core/src/test/java/zingg/common/core/model/Event.java b/common/core/src/test/java/zingg/common/core/model/Event.java index 0bad3e883..d4ef977bc 100644 --- a/common/core/src/test/java/zingg/common/core/model/Event.java +++ b/common/core/src/test/java/zingg/common/core/model/Event.java @@ -1,15 +1,8 @@ package zingg.common.core.model; -public class Event { - public final Integer id; - public final Integer year; - public final String event; - public final String comment; +public class Event extends EventBase{ public Event(Integer id, Integer year, String event, String comment) { - this.id = id; - this.year = year; - this.event = event; - this.comment = comment; + super(id, year, event, comment); } } diff --git a/common/core/src/test/java/zingg/common/core/model/EventBase.java b/common/core/src/test/java/zingg/common/core/model/EventBase.java new file mode 100644 index 000000000..018fdf486 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/EventBase.java @@ -0,0 +1,15 @@ +package zingg.common.core.model; + +public class EventBase { + public final Integer id; + public final Integer year; + public final String 
event; + public final String comment; + + public EventBase(Integer id, Integer year, String event, String comment) { + this.id = id; + this.year = year; + this.event = event; + this.comment = comment; + } +} diff --git a/common/core/src/test/java/zingg/common/core/model/EventCluster.java b/common/core/src/test/java/zingg/common/core/model/EventCluster.java deleted file mode 100644 index f4697cf28..000000000 --- a/common/core/src/test/java/zingg/common/core/model/EventCluster.java +++ /dev/null @@ -1,23 +0,0 @@ -package zingg.common.core.model; - -public class EventCluster { - public final Integer id; - public final Integer year; - public final String event; - public final String comment; - public final Integer z_year; - public final String z_event; - public final String z_comment; - public final Long z_zid; - - public EventCluster(Integer id, Integer year, String event, String comment, Integer z_year, String z_event, String z_comment, Long z_zid) { - this.id = id; - this.year = year; - this.event = event; - this.comment = comment; - this.z_year = z_year; - this.z_event = z_event; - this.z_comment = z_comment; - this.z_zid = z_zid; - } -} \ No newline at end of file diff --git a/common/core/src/test/java/zingg/common/core/model/EventPair.java b/common/core/src/test/java/zingg/common/core/model/EventPair.java new file mode 100644 index 000000000..97be67d1f --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/model/EventPair.java @@ -0,0 +1,16 @@ +package zingg.common.core.model; + +public class EventPair extends EventBase{ + public final Integer z_year; + public final String z_event; + public final String z_comment; + public final Long z_zid; + + public EventPair(Integer id, Integer year, String event, String comment, Integer z_year, String z_event, String z_comment, Long z_zid) { + super(id, year, event, comment); + this.z_year = z_year; + this.z_event = z_event; + this.z_comment = z_comment; + this.z_zid = z_zid; + } +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index c63f6ffcc..4669dc40d 100644 --- a/pom.xml +++ b/pom.xml @@ -152,6 +152,18 @@ + + org.apache.maven.plugins + maven-jar-plugin + ${maven-jar-plugin.version} + + + + test-jar + + + + maven-compiler-plugin ${maven-compiler-plugin.version} diff --git a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 92e3a42f9..9ace40ea2 100644 --- a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -15,6 +15,7 @@ import zingg.common.client.util.WithSession; import zingg.spark.client.util.SparkDFObjectUtil; import zingg.spark.core.context.ZinggSparkContext; +import zingg.spark.core.executor.ZinggSparkTester; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkHashUtil; @@ -38,16 +39,15 @@ public static void setup() { protected static void setUpSpark() { try { - spark = SparkSession - .builder() - .master("local[*]") - .appName("Zingg" + "Junit") - .getOrCreate(); - ctx = new JavaSparkContext(spark.sparkContext()); + + if(spark == null && ZinggSparkTester.spark == null) { + ZinggSparkTester.setup(); + } + spark = ZinggSparkTester.spark; + ctx = ZinggSparkTester.ctx; + zsCTX = ZinggSparkTester.zsCTX; iWithSession = new WithSession<>(); iWithSession.setSession(spark); - zsCTX = new ZinggSparkContext(); - zsCTX.init(spark); } catch (Throwable e) { if (LOG.isDebugEnabled()) e.printStackTrace(); diff 
--git a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java index d15109f75..80582f02c 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java @@ -20,15 +20,8 @@ import zingg.common.client.ArgumentsUtil; import zingg.common.client.IArguments; import zingg.common.client.IZingg; -import org.apache.spark.sql.SparkSession; -import zingg.spark.client.util.SparkDSUtil; -import zingg.spark.client.util.SparkPipeUtil; import zingg.spark.core.context.ZinggSparkContext; -import zingg.spark.core.util.SparkBlockingTreeUtil; -import zingg.spark.core.util.SparkGraphUtil; -import zingg.spark.core.util.SparkHashUtil; -import zingg.spark.core.util.SparkModelUtil; public class ZinggSparkTester { @@ -91,10 +84,5 @@ public Dataset createDFWithDoubles(int numRows, int numCols) { return spark.createDataFrame(nums, structType); - - - - - } } From 9089543559074f283a0793847bd5fb4f9a5a3704 Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 12 Aug 2024 01:22:39 +0530 Subject: [PATCH 210/219] test changes in Block related classes --- .../src/test/java/zingg/common/core/block/TestSparkBlock.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 9ace40ea2..7bb8a5815 100644 --- a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -40,7 +40,7 @@ public static void setup() { protected static void setUpSpark() { try { - if(spark == null && ZinggSparkTester.spark == null) { + if(spark == null) { ZinggSparkTester.setup(); } spark = ZinggSparkTester.spark; From 72ad5712625e53f85449bf522f3c3004df535a28 Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 12 Aug 2024 11:57:08 +0530 Subject: [PATCH 211/219] class renamed --- .../zingg/common/client/TestZFrameBase.java | 4 ++-- .../common/core/block/TestBlockBase.java | 6 ++--- .../{TestData.java => EventTestData.java} | 2 +- .../core/preprocess/TestStopWordsBase.java | 22 +++++++++---------- .../java/zingg/client/TestSparkFrame.java | 2 -- 5 files changed, 17 insertions(+), 19 deletions(-) rename common/core/src/test/java/zingg/common/core/data/{TestData.java => EventTestData.java} (99%) diff --git a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java index beb376aee..65333939a 100644 --- a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java +++ b/common/client/src/test/java/zingg/common/client/TestZFrameBase.java @@ -45,7 +45,7 @@ public TestZFrameBase(DFObjectUtil dfObjectUtil) { @Test - public void testCreateSparkDataFrameAndGetDF() throws Exception { + public void testCreateZFrameAndGetDF() throws Exception { List sampleDataSet = createSampleDataList(); ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); @@ -64,7 +64,7 @@ public void testCreateSparkDataFrameAndGetDF() throws Exception { } @Test - public void testColumnsNamesandCount() throws Exception { + public void testColumnsNamesAndCount() throws Exception { List sampleDataSet = createSampleDataList(); ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); diff --git 
a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java index 57f85c55f..ecceb6201 100644 --- a/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java +++ b/common/core/src/test/java/zingg/common/core/block/TestBlockBase.java @@ -19,7 +19,7 @@ import zingg.common.core.util.HashUtil; import zingg.common.core.model.Event; import zingg.common.core.model.EventPair; -import zingg.common.core.data.TestData; +import zingg.common.core.data.EventTestData; public abstract class TestBlockBase { @@ -38,8 +38,8 @@ public TestBlockBase(DFObjectUtil dfObjectUtil, HashUtil zFrameEvent = dfObjectUtil.getDFFromObjectList(TestData.createSampleEventData(), Event.class); - ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(TestData.createSampleClusterEventData(), EventPair.class); + ZFrame zFrameEvent = dfObjectUtil.getDFFromObjectList(EventTestData.createSampleEventData(), Event.class); + ZFrame zFrameEventCluster = dfObjectUtil.getDFFromObjectList(EventTestData.createSampleClusterEventData(), EventPair.class); IArguments args = getArguments(); Tree> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1, diff --git a/common/core/src/test/java/zingg/common/core/data/TestData.java b/common/core/src/test/java/zingg/common/core/data/EventTestData.java similarity index 99% rename from common/core/src/test/java/zingg/common/core/data/TestData.java rename to common/core/src/test/java/zingg/common/core/data/EventTestData.java index 9867659cd..6a65aca34 100644 --- a/common/core/src/test/java/zingg/common/core/data/TestData.java +++ b/common/core/src/test/java/zingg/common/core/data/EventTestData.java @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.List; -public class TestData { +public class EventTestData { public static List createSampleEventData() { int row_id = 1; diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java index 0c4d496a9..529cbb4d1 100644 --- a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java +++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -15,7 +15,7 @@ import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; import zingg.common.core.context.Context; -import zingg.common.core.data.TestData; +import zingg.common.core.data.EventTestData; import zingg.common.core.model.Statement; import zingg.common.core.model.PostStopWordProcess; import zingg.common.core.model.PriorStopWordProcess; @@ -42,8 +42,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception { String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData1Original(), Statement.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData1Expected(), Statement.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData1Original(), Statement.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData1Expected(), Statement.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(0); @@ -57,8 +57,8 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception { @Test public void testRemoveStopWordsFromDataset() throws 
ZinggClientException, Exception { - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData2Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData2Expected(), PriorStopWordProcess.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Expected(), PriorStopWordProcess.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(1); ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); @@ -70,8 +70,8 @@ public void testRemoveStopWordsFromDataset() throws ZinggClientException, Except @Test public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData3Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData3Expected(), PriorStopWordProcess.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData3Original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData3Expected(), PriorStopWordProcess.class); StopWordsRemover stopWordsRemover = stopWordsRemovers.get(2); ZFrame newZFrame = stopWordsRemover.preprocessForStopWords(zFrameOriginal); @@ -84,8 +84,8 @@ public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientExcept @Test public void testForOriginalDataAfterPostProcess() throws Exception { - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData4original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData4Expected(), PostStopWordProcess.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData4original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData4Expected(), PostStopWordProcess.class); ZFrame newZFrame = context.getDSUtil().postprocess(zFrameExpected, zFrameOriginal); @@ -96,8 +96,8 @@ public void testForOriginalDataAfterPostProcess() throws Exception { @Test public void testOriginalDataAfterPostProcessLinked() throws Exception { - ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(TestData.getData5Original(), PriorStopWordProcess.class); - ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(TestData.getData5Actual(), PostStopWordProcess.class); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData5Original(), PriorStopWordProcess.class); + ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData5Actual(), PostStopWordProcess.class); ZFrame newZFrame = context.getDSUtil().postprocessLinked(zFrameExpected, zFrameOriginal); diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java index ab496a77e..d3ee3728c 100644 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ b/spark/client/src/test/java/zingg/client/TestSparkFrame.java @@ -17,7 +17,6 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import zingg.common.client.IArguments; import zingg.common.client.TestZFrameBase; import zingg.common.client.ZFrame; import zingg.common.client.util.IWithSession; @@ -31,7 +30,6 @@ public 
class TestSparkFrame extends TestZFrameBase, Row, Column, DataType> { public static final Log LOG = LogFactory.getLog(TestSparkFrame.class); - public static IArguments args; public static JavaSparkContext ctx; public static SparkSession spark; public static IWithSession iWithSession; From b81907c2412da643232ee760275b9be7b552d84a Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 12 Aug 2024 12:05:16 +0530 Subject: [PATCH 212/219] dupe code removed --- .../src/main/java/zingg/common/core/block/Block.java | 12 ++++++------ .../main/java/zingg/spark/core/block/SparkBlock.java | 11 ++++++----- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/common/core/src/main/java/zingg/common/core/block/Block.java b/common/core/src/main/java/zingg/common/core/block/Block.java index 06c0e8c13..0fdd3665b 100644 --- a/common/core/src/main/java/zingg/common/core/block/Block.java +++ b/common/core/src/main/java/zingg/common/core/block/Block.java @@ -13,10 +13,13 @@ import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ListMap; +import zingg.common.core.feature.FeatureFactory; import zingg.common.core.hash.HashFunction; public abstract class Block implements Serializable { + private static final long serialVersionUID = 1L; + public static final Log LOG = LogFactory.getLog(Block.class); protected ZFrame dupes; @@ -117,24 +120,21 @@ public void estimateElimCount(Canopy c, long elimCount) { c.estimateElimCount(); } - public abstract T getDataTypeFromString(String t); - public CanopygetBestNode(Tree> tree, Canopyparent, Canopynode, List fieldsOfInterest) throws Exception { long least = Long.MAX_VALUE; int maxElimination = 0; Canopybest = null; - for (FieldDefinition field : fieldsOfInterest) { if (LOG.isDebugEnabled()){ LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt " - + getDataTypeFromString(field.getDataType())); + + getFeatureFactory().getDataTypeFromString(field.getDataType())); } //Class type = FieldClass.getFieldClassClass(field.getFieldClass()); FieldDefinition context = field; if (least ==0) break;//how much better can it get? 
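// hash functions applicable to a field are looked up by its engine-specific data type, now resolved through the FeatureFactory instead of a per-block override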
// applicable functions - List> functions = functionsMap.get(getDataTypeFromString(field.getDataType())); + List> functions = functionsMap.get(getFeatureFactory().getDataTypeFromString(field.getDataType())); if (LOG.isDebugEnabled()){ LOG.debug("functions are " + functions); } @@ -404,7 +404,7 @@ public void printTree(Tree> tree, } } - + public abstract FeatureFactory getFeatureFactory(); } diff --git a/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java b/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java index 6b8568c42..3cff3a304 100644 --- a/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java +++ b/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java @@ -8,7 +8,9 @@ import zingg.common.client.ZFrame; import zingg.common.client.util.ListMap; import zingg.common.core.block.Block; +import zingg.common.core.feature.FeatureFactory; import zingg.common.core.hash.HashFunction; +import zingg.spark.core.feature.SparkFeatureFactory; public class SparkBlock extends Block, Row, Column, DataType> { @@ -22,11 +24,10 @@ public SparkBlock(ZFrame, Row, Column> training, ZFrame, Row, Column, DataType>> functionsMap, long maxSize) { super(training, dupes, functionsMap, maxSize); } - - + @Override - public DataType getDataTypeFromString(String t) { - return DataType.fromDDL(t); - } + public FeatureFactory getFeatureFactory() { + return new SparkFeatureFactory(); + } } From b370045d8f62823c0d1d440e861bfdac17a625ac Mon Sep 17 00:00:00 2001 From: Sonal Goyal Date: Mon, 12 Aug 2024 14:04:55 +0530 Subject: [PATCH 213/219] added max debug string for query plan config to Spark tester --- .../test/java/zingg/spark/core/executor/ZinggSparkTester.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java index d15109f75..ff557ebc0 100644 --- a/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java +++ b/spark/core/src/test/java/zingg/spark/core/executor/ZinggSparkTester.java @@ -48,14 +48,14 @@ public static void setup() { spark = SparkSession .builder() .master("local[*]") - .appName("Zingg" + "Junit") + .appName("ZinggJunit") + .config("spark.debug.maxToStringFields", 100) .getOrCreate(); ctx = new JavaSparkContext(spark.sparkContext()); JavaSparkContext.jarOfClass(IZingg.class); args = new Arguments(); zsCTX = new ZinggSparkContext(); zsCTX.init(spark); - } catch (Throwable e) { if (LOG.isDebugEnabled()) e.printStackTrace(); From 5a06679dcdd3e6edc7bf82328390c641251838fe Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 12 Aug 2024 16:58:23 +0530 Subject: [PATCH 214/219] class renamed --- .../core/preprocess/TestStopWordsBase.java | 19 ++- .../core/util/IStopWordRemoverUtility.java | 13 -- .../core/util/StopWordRemoverUtility.java | 58 ++++++++- .../common/core/zFrame}/TestZFrameBase.java | 51 +++++--- .../common/core/zFrame}/data/TestData.java | 16 +-- .../core/zFrame}/model/ClusterZScore.java | 2 +- .../zFrame}/model/InputWithZidAndSource.java | 2 +- .../core/zFrame}/model/PairPartOne.java | 2 +- .../core/zFrame}/model/PairPartTwo.java | 2 +- .../common/core/zFrame}/model/Person.java | 2 +- .../core/zFrame}/model/PersonMixed.java | 2 +- spark/client/pom.xml | 4 +- .../test/java/zingg/client/TestSparkBase.java | 47 ++++++++ .../java/zingg/client/TestSparkFrame.java | 114 ------------------ .../src/test/java/zingg/TestSparkBase.java | 46 +++++++ 
.../common/core/block/TestSparkBlock.java | 60 ++------- .../core/preprocess/TestSparkStopWords.java | 52 ++------ .../core/sparkFrame/TestSparkFrame.java | 22 ++++ .../util/SparkStopWordRemoverUtility.java | 55 ++------- 19 files changed, 260 insertions(+), 309 deletions(-) delete mode 100644 common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/TestZFrameBase.java (94%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/data/TestData.java (94%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/ClusterZScore.java (88%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/InputWithZidAndSource.java (88%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/PairPartOne.java (90%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/PairPartTwo.java (90%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/Person.java (91%) rename common/{client/src/test/java/zingg/common/client => core/src/test/java/zingg/common/core/zFrame}/model/PersonMixed.java (92%) create mode 100644 spark/client/src/test/java/zingg/client/TestSparkBase.java delete mode 100644 spark/client/src/test/java/zingg/client/TestSparkFrame.java create mode 100644 spark/core/src/test/java/zingg/TestSparkBase.java create mode 100644 spark/core/src/test/java/zingg/common/core/sparkFrame/TestSparkFrame.java diff --git a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java index 529cbb4d1..aff0fd439 100644 --- a/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java +++ b/common/core/src/test/java/zingg/common/core/preprocess/TestStopWordsBase.java @@ -9,7 +9,6 @@ import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; -import zingg.common.client.Arguments; import zingg.common.client.ZFrame; import zingg.common.client.ZinggClientException; import zingg.common.client.util.ColName; @@ -19,20 +18,19 @@ import zingg.common.core.model.Statement; import zingg.common.core.model.PostStopWordProcess; import zingg.common.core.model.PriorStopWordProcess; -import zingg.common.core.util.IStopWordRemoverUtility; +import zingg.common.core.util.StopWordRemoverUtility; public abstract class TestStopWordsBase { public static final Log LOG = LogFactory.getLog(TestStopWordsBase.class); private final DFObjectUtil dfObjectUtil; - private final List> stopWordsRemovers; + private final StopWordRemoverUtility stopWordRemoverUtility; private final Context context; - public TestStopWordsBase(DFObjectUtil dfObjectUtil, IStopWordRemoverUtility IStopWordRemoverUtility, - Context context) throws ZinggClientException { + public TestStopWordsBase(DFObjectUtil dfObjectUtil, StopWordRemoverUtility stopWordRemoverUtility, Context context) { this.dfObjectUtil = dfObjectUtil; - this.stopWordsRemovers = IStopWordRemoverUtility.getStopWordRemovers(context, new Arguments()); + this.stopWordRemoverUtility = stopWordRemoverUtility; this.context = context; } @@ -40,6 +38,7 @@ public TestStopWordsBase(DFObjectUtil dfObjectUtil, IStopWordRemover @Test public void 
testStopWordsSingleColumn() throws ZinggClientException, Exception { + List> stopWordsRemovers = getStopWordsRemovers(); String stopWords = "\\b(a|an|the|is|It|of|yes|no|I|has|have|you)\\b\\s?".toLowerCase(); ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData1Original(), Statement.class); @@ -57,6 +56,7 @@ public void testStopWordsSingleColumn() throws ZinggClientException, Exception { @Test public void testRemoveStopWordsFromDataset() throws ZinggClientException, Exception { + List> stopWordsRemovers = getStopWordsRemovers(); ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Original(), PriorStopWordProcess.class); ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Expected(), PriorStopWordProcess.class); @@ -70,6 +70,8 @@ public void testRemoveStopWordsFromDataset() throws ZinggClientException, Except @Test public void testStopWordColumnMissingFromStopWordFile() throws ZinggClientException, Exception { + List> stopWordsRemovers = getStopWordsRemovers(); + ZFrame zFrameOriginal = dfObjectUtil.getDFFromObjectList(EventTestData.getData3Original(), PriorStopWordProcess.class); ZFrame zFrameExpected = dfObjectUtil.getDFFromObjectList(EventTestData.getData3Expected(), PriorStopWordProcess.class); @@ -105,4 +107,9 @@ public void testOriginalDataAfterPostProcessLinked() throws Exception { assertTrue(zFrameOriginal.select("field1", "field2", "field3").except(newZFrame.select("field1", "field2", "field3")).isEmpty()); } + private List> getStopWordsRemovers() throws ZinggClientException { + stopWordRemoverUtility.buildStopWordRemovers(); + return stopWordRemoverUtility.getStopWordsRemovers(); + } + } \ No newline at end of file diff --git a/common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java b/common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java deleted file mode 100644 index d7e74f8f1..000000000 --- a/common/core/src/test/java/zingg/common/core/util/IStopWordRemoverUtility.java +++ /dev/null @@ -1,13 +0,0 @@ -package zingg.common.core.util; - -import zingg.common.client.IArguments; -import zingg.common.client.ZinggClientException; -import zingg.common.core.context.Context; -import zingg.common.core.preprocess.StopWordsRemover; - -import java.util.List; - -public interface IStopWordRemoverUtility { - - List> getStopWordRemovers(Context context, IArguments arguments) throws ZinggClientException; -} diff --git a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java index 04871a606..611c36700 100644 --- a/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java +++ b/common/core/src/test/java/zingg/common/core/util/StopWordRemoverUtility.java @@ -1,13 +1,65 @@ package zingg.common.core.util; +import zingg.common.client.Arguments; +import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; +import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; -import zingg.common.core.context.Context; import zingg.common.core.preprocess.StopWordsRemover; +import java.util.ArrayList; import java.util.List; +import java.util.Objects; -public interface StopWordRemoverUtility { +public abstract class StopWordRemoverUtility { - List> getStopWordRemovers(Context context, IArguments arguments) throws ZinggClientException; + protected final List> stopWordsRemovers; + + public StopWordRemoverUtility() throws 
ZinggClientException { + this.stopWordsRemovers = new ArrayList>();; + } + + public void buildStopWordRemovers() throws ZinggClientException { + + //add first stopWordRemover + List fdList = new ArrayList(4); + ArrayList matchTypelistFuzzy = new ArrayList(); + matchTypelistFuzzy.add(MatchType.FUZZY); + FieldDefinition eventFD = new FieldDefinition(); + eventFD.setDataType("string"); + eventFD.setFieldName("statement"); + eventFD.setMatchType(matchTypelistFuzzy); + fdList.add(eventFD); + IArguments stmtArgs = new Arguments(); + stmtArgs.setFieldDefinition(fdList); + addStopWordRemover(stmtArgs); + + //add second stopWordRemover + String stopWordsFileName1 = Objects.requireNonNull( + StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); + FieldDefinition fieldDefinition1 = new FieldDefinition(); + fieldDefinition1.setStopWords(stopWordsFileName1); + fieldDefinition1.setFieldName("field1"); + List fieldDefinitionList1 = List.of(fieldDefinition1); + stmtArgs = new Arguments(); + stmtArgs.setFieldDefinition(fieldDefinitionList1); + addStopWordRemover(stmtArgs); + + //add third stopWordRemover + String stopWordsFileName2 = Objects.requireNonNull( + StopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); + FieldDefinition fieldDefinition2 = new FieldDefinition(); + fieldDefinition2.setStopWords(stopWordsFileName2); + fieldDefinition2.setFieldName("field1"); + List fieldDefinitionList2 = List.of(fieldDefinition2); + stmtArgs = new Arguments(); + stmtArgs.setFieldDefinition(fieldDefinitionList2); + addStopWordRemover(stmtArgs); + } + + public List> getStopWordsRemovers() { + return this.stopWordsRemovers; + } + + public abstract void addStopWordRemover(IArguments iArguments); } diff --git a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java b/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java similarity index 94% rename from common/client/src/test/java/zingg/common/client/TestZFrameBase.java rename to common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java index 65333939a..c5bab8055 100644 --- a/common/client/src/test/java/zingg/common/client/TestZFrameBase.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java @@ -1,18 +1,19 @@ -package zingg.common.client; +package zingg.common.core.zFrame; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import zingg.common.client.ZFrame; import zingg.common.client.util.ColName; import zingg.common.client.util.DFObjectUtil; -import zingg.common.client.model.Person; -import zingg.common.client.model.PersonMixed; -import zingg.common.client.model.ClusterZScore; -import zingg.common.client.model.InputWithZidAndSource; -import zingg.common.client.model.PairPartOne; -import zingg.common.client.model.PairPartTwo; +import zingg.common.core.zFrame.model.ClusterZScore; +import zingg.common.core.zFrame.model.InputWithZidAndSource; +import zingg.common.core.zFrame.model.PairPartOne; +import zingg.common.core.zFrame.model.PairPartTwo; +import zingg.common.core.zFrame.model.Person; +import zingg.common.core.zFrame.model.PersonMixed; import java.lang.reflect.Field; import java.util.ArrayList; @@ -22,17 +23,17 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -import static 
zingg.common.client.data.TestData.createEmptySampleData; -import static zingg.common.client.data.TestData.createSampleDataCluster; -import static zingg.common.client.data.TestData.createSampleDataClusterWithNull; -import static zingg.common.client.data.TestData.createSampleDataInput; -import static zingg.common.client.data.TestData.createSampleDataList; -import static zingg.common.client.data.TestData.createSampleDataListDistinct; -import static zingg.common.client.data.TestData.createSampleDataListWithDistinctSurnameAndPostcode; -import static zingg.common.client.data.TestData.createSampleDataListWithMixedDataType; -import static zingg.common.client.data.TestData.createSampleDataZScore; - -public abstract class TestZFrameBase { +import static zingg.common.core.zFrame.data.TestData.createEmptySampleData; +import static zingg.common.core.zFrame.data.TestData.createSampleDataCluster; +import static zingg.common.core.zFrame.data.TestData.createSampleDataClusterWithNull; +import static zingg.common.core.zFrame.data.TestData.createSampleDataInput; +import static zingg.common.core.zFrame.data.TestData.createSampleDataList; +import static zingg.common.core.zFrame.data.TestData.createSampleDataListDistinct; +import static zingg.common.core.zFrame.data.TestData.createSampleDataListWithDistinctSurnameAndPostcode; +import static zingg.common.core.zFrame.data.TestData.createSampleDataListWithMixedDataType; +import static zingg.common.core.zFrame.data.TestData.createSampleDataZScore; + +public abstract class TestZFrameBase { public static final Log LOG = LogFactory.getLog(TestZFrameBase.class); public static final String NEW_COLUMN = "newColumn"; @@ -43,6 +44,16 @@ public TestZFrameBase(DFObjectUtil dfObjectUtil) { this.dfObjectUtil = dfObjectUtil; } + @Test + public void testAliasOfZFrame() throws Exception { + List sampleDataSet = createSampleDataList(); + ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); + + String aliasName = "AnotherName"; + zFrame.as(aliasName); + assertTrueCheckingExceptOutput(zFrame.as(aliasName), zFrame, "Dataframe and its alias are not same"); + } + @Test public void testCreateZFrameAndGetDF() throws Exception { @@ -626,4 +637,8 @@ public void testDistinct() throws Exception { } } } + + protected void assertTrueCheckingExceptOutput(ZFrame sf1, ZFrame sf2, String message) { + assertTrue(sf1.except(sf2).isEmpty(), message); + } } \ No newline at end of file diff --git a/common/client/src/test/java/zingg/common/client/data/TestData.java b/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java similarity index 94% rename from common/client/src/test/java/zingg/common/client/data/TestData.java rename to common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java index 6c915617c..5bb0003a6 100644 --- a/common/client/src/test/java/zingg/common/client/data/TestData.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java @@ -1,12 +1,12 @@ -package zingg.common.client.data; +package zingg.common.core.zFrame.data; -import zingg.common.client.model.Person; -import zingg.common.client.model.PairPartOne; -import zingg.common.client.model.PairPartTwo; -import zingg.common.client.model.InputWithZidAndSource; -import zingg.common.client.model.PersonMixed; -import zingg.common.client.model.ClusterZScore; +import zingg.common.core.zFrame.model.ClusterZScore; +import zingg.common.core.zFrame.model.InputWithZidAndSource; +import zingg.common.core.zFrame.model.PairPartOne; +import 
zingg.common.core.zFrame.model.PairPartTwo; +import zingg.common.core.zFrame.model.Person; +import zingg.common.core.zFrame.model.PersonMixed; import java.util.ArrayList; import java.util.List; @@ -16,7 +16,7 @@ public class TestData { //sample data classes to be used for testing public static List createEmptySampleData() { - return new ArrayList<>(); + return new ArrayList(); } public static List createSampleDataList() { diff --git a/common/client/src/test/java/zingg/common/client/model/ClusterZScore.java b/common/core/src/test/java/zingg/common/core/zFrame/model/ClusterZScore.java similarity index 88% rename from common/client/src/test/java/zingg/common/client/model/ClusterZScore.java rename to common/core/src/test/java/zingg/common/core/zFrame/model/ClusterZScore.java index 83785087f..e10788395 100644 --- a/common/client/src/test/java/zingg/common/client/model/ClusterZScore.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/ClusterZScore.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class ClusterZScore { public final Long z_zid; diff --git a/common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java b/common/core/src/test/java/zingg/common/core/zFrame/model/InputWithZidAndSource.java similarity index 88% rename from common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java rename to common/core/src/test/java/zingg/common/core/zFrame/model/InputWithZidAndSource.java index 78e6f4418..a370e9643 100644 --- a/common/client/src/test/java/zingg/common/client/model/InputWithZidAndSource.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/InputWithZidAndSource.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class InputWithZidAndSource { public final Long z_zid; diff --git a/common/client/src/test/java/zingg/common/client/model/PairPartOne.java b/common/core/src/test/java/zingg/common/core/zFrame/model/PairPartOne.java similarity index 90% rename from common/client/src/test/java/zingg/common/client/model/PairPartOne.java rename to common/core/src/test/java/zingg/common/core/zFrame/model/PairPartOne.java index e26dd1fd7..3f4ef6adc 100644 --- a/common/client/src/test/java/zingg/common/client/model/PairPartOne.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/PairPartOne.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class PairPartOne { public final Long z_zid; diff --git a/common/client/src/test/java/zingg/common/client/model/PairPartTwo.java b/common/core/src/test/java/zingg/common/core/zFrame/model/PairPartTwo.java similarity index 90% rename from common/client/src/test/java/zingg/common/client/model/PairPartTwo.java rename to common/core/src/test/java/zingg/common/core/zFrame/model/PairPartTwo.java index 6fb68c9c2..44e1ccd17 100644 --- a/common/client/src/test/java/zingg/common/client/model/PairPartTwo.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/PairPartTwo.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class PairPartTwo { public final Long z_z_zid; diff --git a/common/client/src/test/java/zingg/common/client/model/Person.java b/common/core/src/test/java/zingg/common/core/zFrame/model/Person.java similarity index 91% rename from common/client/src/test/java/zingg/common/client/model/Person.java rename to 
common/core/src/test/java/zingg/common/core/zFrame/model/Person.java index 8c12519fe..d1ea21612 100644 --- a/common/client/src/test/java/zingg/common/client/model/Person.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/Person.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class Person { public final String recid; diff --git a/common/client/src/test/java/zingg/common/client/model/PersonMixed.java b/common/core/src/test/java/zingg/common/core/zFrame/model/PersonMixed.java similarity index 92% rename from common/client/src/test/java/zingg/common/client/model/PersonMixed.java rename to common/core/src/test/java/zingg/common/core/zFrame/model/PersonMixed.java index d432370c2..a200c4f49 100644 --- a/common/client/src/test/java/zingg/common/client/model/PersonMixed.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/model/PersonMixed.java @@ -1,4 +1,4 @@ -package zingg.common.client.model; +package zingg.common.core.zFrame.model; public class PersonMixed { public final Integer recid; diff --git a/spark/client/pom.xml b/spark/client/pom.xml index e5c2146c4..3b86a0a93 100644 --- a/spark/client/pom.xml +++ b/spark/client/pom.xml @@ -8,8 +8,8 @@ zingg-spark-client jar - 2.12.0 - 2.12.0 + 2.15.2 + 2.15.2 diff --git a/spark/client/src/test/java/zingg/client/TestSparkBase.java b/spark/client/src/test/java/zingg/client/TestSparkBase.java new file mode 100644 index 000000000..69a3d7c3d --- /dev/null +++ b/spark/client/src/test/java/zingg/client/TestSparkBase.java @@ -0,0 +1,47 @@ +//package zingg.client; +// +//import org.apache.spark.sql.SparkSession; +//import org.junit.jupiter.api.extension.AfterAllCallback; +//import org.junit.jupiter.api.extension.BeforeAllCallback; +//import org.junit.jupiter.api.extension.ExtensionContext; +//import org.junit.jupiter.api.extension.ParameterContext; +//import org.junit.jupiter.api.extension.ParameterResolutionException; +//import org.junit.jupiter.api.extension.ParameterResolver; +// +//import zingg.spark.core.executor.ZinggSparkTester; +// +//public class TestSparkBase extends ZinggSparkTester implements BeforeAllCallback, AfterAllCallback, ParameterResolver{ +// +// public SparkSession sparkSession; +// +// static boolean isSetUp; +// +// @Override +// public boolean supportsParameter(ParameterContext parameterContext, ExtensionContext extensionContext) +// throws ParameterResolutionException { +// return parameterContext.getParameter().getType() +// .equals(SparkSession.class); +// } +// +// @Override +// public Object resolveParameter(ParameterContext parameterContext, ExtensionContext extensionContext) +// throws ParameterResolutionException { +// return sparkSession; +// } +// +// @Override +// public void afterAll(ExtensionContext context) throws Exception { +// +// } +// +// @Override +// public void beforeAll(ExtensionContext context) throws Exception { +// if (!isSetUp || sparkSession == null) { +// super.setup(); +// sparkSession = ZinggSparkTester.spark; +// } +// isSetUp = true; +// } +// +// +//} diff --git a/spark/client/src/test/java/zingg/client/TestSparkFrame.java b/spark/client/src/test/java/zingg/client/TestSparkFrame.java deleted file mode 100644 index d3ee3728c..000000000 --- a/spark/client/src/test/java/zingg/client/TestSparkFrame.java +++ /dev/null @@ -1,114 +0,0 @@ -package zingg.client; - - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.api.java.JavaSparkContext; -import 
org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import zingg.common.client.TestZFrameBase; -import zingg.common.client.ZFrame; -import zingg.common.client.util.IWithSession; -import zingg.common.client.util.WithSession; -import zingg.spark.client.SparkFrame; -import zingg.spark.client.util.SparkDFObjectUtil; - -import java.util.Arrays; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestSparkFrame extends TestZFrameBase, Row, Column, DataType> { - public static final Log LOG = LogFactory.getLog(TestSparkFrame.class); - public static JavaSparkContext ctx; - public static SparkSession spark; - public static IWithSession iWithSession; - - public TestSparkFrame() { - super(new SparkDFObjectUtil(iWithSession)); - } - - @BeforeAll - public static void setup() { - setUpSpark(); - } - - protected static void setUpSpark() { - try { - spark = SparkSession - .builder() - .master("local[*]") - .appName("Zingg" + "Junit") - .getOrCreate(); - ctx = new JavaSparkContext(spark.sparkContext()); - iWithSession = new WithSession<>(); - iWithSession.setSession(spark); - } catch (Throwable e) { - if (LOG.isDebugEnabled()) - e.printStackTrace(); - LOG.info("Problem in spark env setup"); - } - } - - @AfterAll - public static void teardown() { - if (ctx != null) { - ctx.stop(); - ctx = null; - } - if (spark != null) { - spark.stop(); - spark = null; - } - } - - @Test - public void testAliasOfSparkFrame() { - SparkFrame sf = new SparkFrame(createSampleDataset()); - String aliasName = "AnotherName"; - sf.as(aliasName); - assertTrueCheckingExceptOutput(sf.as(aliasName), sf, "Dataframe and its alias are not same"); - } - - public Dataset createSampleDataset() { - - if (spark == null) { - setUpSpark(); - } - - StructType schemaOfSample = new StructType(new StructField[]{ - new StructField("recid", DataTypes.StringType, false, Metadata.empty()), - new StructField("givenname", DataTypes.StringType, false, Metadata.empty()), - new StructField("surname", DataTypes.StringType, false, Metadata.empty()), - new StructField("suburb", DataTypes.StringType, false, Metadata.empty()), - new StructField("postcode", DataTypes.StringType, false, Metadata.empty()) - }); - - return spark.createDataFrame(Arrays.asList( - RowFactory.create("07317257", "erjc", "henson", "hendersonville", "2873g"), - RowFactory.create("03102490", "jhon", "kozak", "henders0nville", "28792"), - RowFactory.create("02890805", "david", "pisczek", "durham", "27717"), - RowFactory.create("04437063", "e5in", "bbrown", "greenville", "27858"), - RowFactory.create("03211564", "susan", "jones", "greenjboro", "274o7"), - RowFactory.create("04155808", "jerome", "wilkins", "battleborn", "2780g"), - RowFactory.create("05723231", "clarinw", "pastoreus", "elizabeth city", "27909"), - RowFactory.create("06087743", "william", "craven", "greenshoro", "27405"), - RowFactory.create("00538491", "marh", "jackdon", "greensboro", "27406"), - RowFactory.create("01306702", "vonnell", "palmer", "siler sity", "273q4")), schemaOfSample); - } - - protected void 
assertTrueCheckingExceptOutput(ZFrame, Row, Column> sf1, ZFrame, Row, Column> sf2, String message) { - assertTrue(sf1.except(sf2).isEmpty(), message); - } - -} \ No newline at end of file diff --git a/spark/core/src/test/java/zingg/TestSparkBase.java b/spark/core/src/test/java/zingg/TestSparkBase.java new file mode 100644 index 000000000..e04782700 --- /dev/null +++ b/spark/core/src/test/java/zingg/TestSparkBase.java @@ -0,0 +1,46 @@ +package zingg; + +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.extension.AfterAllCallback; +import org.junit.jupiter.api.extension.BeforeAllCallback; +import org.junit.jupiter.api.extension.ExtensionContext; +import org.junit.jupiter.api.extension.ParameterContext; +import org.junit.jupiter.api.extension.ParameterResolutionException; +import org.junit.jupiter.api.extension.ParameterResolver; +import zingg.spark.core.executor.ZinggSparkTester; + +public class TestSparkBase extends ZinggSparkTester implements BeforeAllCallback, AfterAllCallback, ParameterResolver{ + + public SparkSession sparkSession; + + static boolean isSetUp; + + @Override + public boolean supportsParameter(ParameterContext parameterContext, ExtensionContext extensionContext) + throws ParameterResolutionException { + return parameterContext.getParameter().getType() + .equals(SparkSession.class); + } + + @Override + public Object resolveParameter(ParameterContext parameterContext, ExtensionContext extensionContext) + throws ParameterResolutionException { + return sparkSession; + } + + @Override + public void afterAll(ExtensionContext context) { + + } + + @Override + public void beforeAll(ExtensionContext context) { + if (!isSetUp || sparkSession == null) { + super.setup(); + sparkSession = ZinggSparkTester.spark; + } + isSetUp = true; + } + + +} diff --git a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index 7bb8a5815..d0471d9d4 100644 --- a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -1,69 +1,29 @@ package zingg.common.core.block; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import zingg.common.client.IArguments; +import org.junit.jupiter.api.extension.ExtendWith; +import zingg.TestSparkBase; +import zingg.common.client.ZinggClientException; import zingg.common.client.util.IWithSession; import zingg.common.client.util.WithSession; import zingg.spark.client.util.SparkDFObjectUtil; import zingg.spark.core.context.ZinggSparkContext; -import zingg.spark.core.executor.ZinggSparkTester; import zingg.spark.core.util.SparkBlockingTreeUtil; import zingg.spark.core.util.SparkHashUtil; +@ExtendWith(TestSparkBase.class) public class TestSparkBlock extends TestBlockBase, Row, Column, DataType> { - public static final Log LOG = LogFactory.getLog(TestSparkBlock.class); - public static IArguments args; - public static JavaSparkContext ctx; - public static ZinggSparkContext zsCTX; - public static SparkSession spark; - public static IWithSession iWithSession; + public static ZinggSparkContext zsCTX = new ZinggSparkContext(); + 
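// shared fixtures for every test in this class; the SparkSession itself is supplied by the TestSparkBase JUnit 5 extension through constructor injection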
public static IWithSession iWithSession = new WithSession<>(); - public TestSparkBlock() { - super(new SparkDFObjectUtil(iWithSession), new SparkHashUtil(spark), new SparkBlockingTreeUtil(spark, zsCTX.getPipeUtil())); - } - - @BeforeAll - public static void setup() { - setUpSpark(); - } - - protected static void setUpSpark() { - try { - - if(spark == null) { - ZinggSparkTester.setup(); - } - spark = ZinggSparkTester.spark; - ctx = ZinggSparkTester.ctx; - zsCTX = ZinggSparkTester.zsCTX; - iWithSession = new WithSession<>(); - iWithSession.setSession(spark); - } catch (Throwable e) { - if (LOG.isDebugEnabled()) - e.printStackTrace(); - LOG.info("Problem in spark env setup"); - } - } - - @AfterAll - public static void teardown() { - if (ctx != null) { - ctx.stop(); - ctx = null; - } - if (spark != null) { - spark.stop(); - spark = null; - } + public TestSparkBlock(SparkSession sparkSession) throws ZinggClientException { + super(new SparkDFObjectUtil(iWithSession), new SparkHashUtil(sparkSession), new SparkBlockingTreeUtil(sparkSession, zsCTX.getPipeUtil())); + iWithSession.setSession(sparkSession); + zsCTX.init(sparkSession); } } diff --git a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java index 4c9ebc025..4887e3c09 100644 --- a/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java +++ b/spark/core/src/test/java/zingg/common/core/preprocess/TestSparkStopWords.java @@ -10,6 +10,8 @@ import org.apache.spark.sql.types.DataType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.extension.ExtendWith; +import zingg.TestSparkBase; import zingg.common.client.ZinggClientException; import zingg.common.client.util.IWithSession; import zingg.common.client.util.WithSession; @@ -17,51 +19,15 @@ import zingg.spark.client.util.SparkDFObjectUtil; import zingg.spark.core.context.ZinggSparkContext; +@ExtendWith(TestSparkBase.class) public class TestSparkStopWords extends TestStopWordsBase, Row, Column, DataType> { - public static final Log LOG = LogFactory.getLog(TestSparkStopWords.class); - public static JavaSparkContext ctx; - public static SparkSession spark; - public static ZinggSparkContext zsCTX; - public static IWithSession iWithSession; + public static IWithSession iWithSession = new WithSession(); + public static ZinggSparkContext zsCTX = new ZinggSparkContext(); - @BeforeAll - public static void setup() { - setUpSpark(); - } - - public TestSparkStopWords() throws ZinggClientException { - super(new SparkDFObjectUtil(iWithSession), new SparkStopWordRemoverUtility(), zsCTX); - } - - protected static void setUpSpark() { - try { - spark = SparkSession - .builder() - .master("local[*]") - .appName("Zingg" + "Junit") - .getOrCreate(); - ctx = new JavaSparkContext(spark.sparkContext()); - iWithSession = new WithSession<>(); - iWithSession.setSession(spark); - zsCTX = new ZinggSparkContext(); - zsCTX.init(spark); - } catch (Throwable e) { - if (LOG.isDebugEnabled()) - e.printStackTrace(); - LOG.info("Problem in spark env setup"); - } - } - - @AfterAll - public static void teardown() { - if (ctx != null) { - ctx.stop(); - ctx = null; - } - if (spark != null) { - spark.stop(); - spark = null; - } + public TestSparkStopWords(SparkSession sparkSession) throws ZinggClientException { + super(new SparkDFObjectUtil(iWithSession), new SparkStopWordRemoverUtility(zsCTX), zsCTX); + iWithSession.setSession(sparkSession); + 
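// initialize the Zingg Spark context against the same injected session before any test executes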
zsCTX.init(sparkSession); } } diff --git a/spark/core/src/test/java/zingg/common/core/sparkFrame/TestSparkFrame.java b/spark/core/src/test/java/zingg/common/core/sparkFrame/TestSparkFrame.java new file mode 100644 index 000000000..4bddaa44a --- /dev/null +++ b/spark/core/src/test/java/zingg/common/core/sparkFrame/TestSparkFrame.java @@ -0,0 +1,22 @@ +package zingg.common.core.sparkFrame; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.extension.ExtendWith; +import zingg.TestSparkBase; +import zingg.common.client.util.IWithSession; +import zingg.common.client.util.WithSession; +import zingg.common.core.zFrame.TestZFrameBase; +import zingg.spark.client.util.SparkDFObjectUtil; + +@ExtendWith(TestSparkBase.class) +public class TestSparkFrame extends TestZFrameBase, Row, Column> { + public static IWithSession iWithSession = new WithSession(); + + public TestSparkFrame(SparkSession sparkSession) { + super(new SparkDFObjectUtil(iWithSession)); + iWithSession.setSession(sparkSession); + } +} \ No newline at end of file diff --git a/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java index 8057c636e..32af2bbbd 100644 --- a/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java +++ b/spark/core/src/test/java/zingg/common/core/util/SparkStopWordRemoverUtility.java @@ -5,59 +5,22 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataType; -import zingg.common.client.Arguments; -import zingg.common.client.FieldDefinition; import zingg.common.client.IArguments; -import zingg.common.client.MatchType; import zingg.common.client.ZinggClientException; import zingg.common.core.context.Context; -import zingg.common.core.preprocess.StopWordsRemover; import zingg.spark.core.preprocess.SparkStopWordsRemover; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; +public class SparkStopWordRemoverUtility extends StopWordRemoverUtility, Row, Column, DataType> { -public class SparkStopWordRemoverUtility implements IStopWordRemoverUtility, Row, Column, DataType> { + private final Context, Row, Column, DataType> context; - @Override - public List, Row, Column, DataType>> getStopWordRemovers(Context, Row, Column, DataType> context, IArguments arguments) throws ZinggClientException { - - List, Row, Column, DataType>> sparkStopWordsRemovers = new ArrayList<>(); - - //add first stopWordRemover - List fdList = new ArrayList(4); - ArrayList matchTypelistFuzzy = new ArrayList(); - matchTypelistFuzzy.add(MatchType.FUZZY); - FieldDefinition eventFD = new FieldDefinition(); - eventFD.setDataType("string"); - eventFD.setFieldName("statement"); - eventFD.setMatchType(matchTypelistFuzzy); - fdList.add(eventFD); - IArguments stmtArgs = new Arguments(); - stmtArgs.setFieldDefinition(fdList); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(context,stmtArgs)); - - //add second stopWordRemover - String stopWordsFileName1 = Objects.requireNonNull( - IStopWordRemoverUtility.class.getResource("../../../../preProcess/stopWords.csv")).getFile(); - FieldDefinition fieldDefinition1 = new FieldDefinition(); - fieldDefinition1.setStopWords(stopWordsFileName1); - fieldDefinition1.setFieldName("field1"); - List fieldDefinitionList1 = List.of(fieldDefinition1); - 
arguments.setFieldDefinition(fieldDefinitionList1); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments)); - - //add third stopWordRemover - String stopWordsFileName2 = Objects.requireNonNull( - IStopWordRemoverUtility.class.getResource("../../../../preProcess/stopWordsWithoutHeader.csv")).getFile(); - FieldDefinition fieldDefinition2 = new FieldDefinition(); - fieldDefinition2.setStopWords(stopWordsFileName2); - fieldDefinition2.setFieldName("field1"); - List fieldDefinitionList2 = List.of(fieldDefinition2); - arguments.setFieldDefinition(fieldDefinitionList2); - sparkStopWordsRemovers.add(new SparkStopWordsRemover(context, arguments)); + public SparkStopWordRemoverUtility(Context, Row, Column, DataType> context) throws ZinggClientException { + super(); + this.context = context; + } - return sparkStopWordsRemovers; + @Override + public void addStopWordRemover(IArguments iArguments) { + super.stopWordsRemovers.add(new SparkStopWordsRemover(context, iArguments)); } } From e58f51bf5a1599d68892fdad963833aaa6be263a Mon Sep 17 00:00:00 2001 From: administrator Date: Mon, 12 Aug 2024 17:04:18 +0530 Subject: [PATCH 215/219] removed commented class --- .../test/java/zingg/client/TestSparkBase.java | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100644 spark/client/src/test/java/zingg/client/TestSparkBase.java diff --git a/spark/client/src/test/java/zingg/client/TestSparkBase.java b/spark/client/src/test/java/zingg/client/TestSparkBase.java deleted file mode 100644 index 69a3d7c3d..000000000 --- a/spark/client/src/test/java/zingg/client/TestSparkBase.java +++ /dev/null @@ -1,47 +0,0 @@ -//package zingg.client; -// -//import org.apache.spark.sql.SparkSession; -//import org.junit.jupiter.api.extension.AfterAllCallback; -//import org.junit.jupiter.api.extension.BeforeAllCallback; -//import org.junit.jupiter.api.extension.ExtensionContext; -//import org.junit.jupiter.api.extension.ParameterContext; -//import org.junit.jupiter.api.extension.ParameterResolutionException; -//import org.junit.jupiter.api.extension.ParameterResolver; -// -//import zingg.spark.core.executor.ZinggSparkTester; -// -//public class TestSparkBase extends ZinggSparkTester implements BeforeAllCallback, AfterAllCallback, ParameterResolver{ -// -// public SparkSession sparkSession; -// -// static boolean isSetUp; -// -// @Override -// public boolean supportsParameter(ParameterContext parameterContext, ExtensionContext extensionContext) -// throws ParameterResolutionException { -// return parameterContext.getParameter().getType() -// .equals(SparkSession.class); -// } -// -// @Override -// public Object resolveParameter(ParameterContext parameterContext, ExtensionContext extensionContext) -// throws ParameterResolutionException { -// return sparkSession; -// } -// -// @Override -// public void afterAll(ExtensionContext context) throws Exception { -// -// } -// -// @Override -// public void beforeAll(ExtensionContext context) throws Exception { -// if (!isSetUp || sparkSession == null) { -// super.setup(); -// sparkSession = ZinggSparkTester.spark; -// } -// isSetUp = true; -// } -// -// -//} From b330a03fd1659389d0d6a86936ebb1942e9a51fa Mon Sep 17 00:00:00 2001 From: administrator Date: Tue, 13 Aug 2024 19:54:03 +0530 Subject: [PATCH 216/219] removed <> everywhere in code --- .../zingg/common/core/data/EventTestData.java | 24 ++++++++-------- .../common/core/zFrame/TestZFrameBase.java | 28 +++++++++---------- .../common/core/zFrame/data/TestData.java | 8 +++--- 
.../common/core/block/TestSparkBlock.java | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/common/core/src/test/java/zingg/common/core/data/EventTestData.java b/common/core/src/test/java/zingg/common/core/data/EventTestData.java index 6a65aca34..9531b6772 100644 --- a/common/core/src/test/java/zingg/common/core/data/EventTestData.java +++ b/common/core/src/test/java/zingg/common/core/data/EventTestData.java @@ -13,7 +13,7 @@ public class EventTestData { public static List createSampleEventData() { int row_id = 1; - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new Event(row_id++, 1942, "quit India", "Mahatma Gandhi")); sample.add(new Event(row_id++, 1919, "JallianWala", "Punjab")); sample.add(new Event(row_id++, 1930, "Civil Disob", "India")); @@ -91,7 +91,7 @@ public static List createSampleEventData() { public static List createSampleClusterEventData() { int row_id = 1; - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new EventPair(row_id++, 1942, "quit Nation", "Mahatma",1942, "quit Nation", "Mahatma", 1L)); sample.add(new EventPair(row_id++, 1919, "JallianWal", "Punjb", 1919, "JallianWal", "Punjb", 2L)); sample.add(new EventPair(row_id++, 1942, "quit N", "Mahatma", 1942, "quit N", "Mahatma", 1L)); @@ -138,7 +138,7 @@ public static List createSampleClusterEventData() { public static List getData1Original() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new Statement("The zingg is a Spark application")); sample.add(new Statement("It is very popular in data Science")); sample.add(new Statement("It is written in Java and Scala")); @@ -149,7 +149,7 @@ public static List getData1Original() { public static List getData1Expected() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new Statement("zingg spark application")); sample.add(new Statement("very popular in data science")); sample.add(new Statement("written in java and scala")); @@ -160,7 +160,7 @@ public static List getData1Expected() { public static List getData2Original() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed", @@ -173,7 +173,7 @@ public static List getData2Original() { public static List getData2Expected() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. a good application", "test")); sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test")); sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test")); @@ -184,7 +184,7 @@ public static List getData2Expected() { public static List getData3Original() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); sample.add(new PriorStopWordProcess("20", "It is very popular in Data Science", "Three", "true indeed", @@ -197,7 +197,7 @@ public static List getData3Original() { public static List getData3Expected() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "zingg spark application", "two", "Yes. 
a good application", "test")); sample.add(new PriorStopWordProcess("20", "very popular data science", "Three", "true indeed", "test")); sample.add(new PriorStopWordProcess("30", "written java scala", "four", "", "test")); @@ -208,7 +208,7 @@ public static List getData3Expected() { public static List getData4original() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed", @@ -221,7 +221,7 @@ public static List getData4original() { public static List getData4Expected() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1", "The zingg spark application", "two", "Yes. good application", "test")); sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1", @@ -236,7 +236,7 @@ public static List getData4Expected() { public static List getData5Original() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PriorStopWordProcess("10", "The zingg is a spark application", "two", "Yes. a good application", "test")); sample.add(new PriorStopWordProcess("20", "It is very popular in data science", "Three", "true indeed", @@ -249,7 +249,7 @@ public static List getData5Original() { public static List getData5Actual() { - List sample = new ArrayList<>(); + List sample = new ArrayList(); sample.add(new PostStopWordProcess("1648811730857:10", "10", "1.0", "0.555555", "-1", "The zingg spark application", "two", "Yes. good application", "test")); sample.add(new PostStopWordProcess("1648811730857:20", "20", "1.0", "1.0", "-1", diff --git a/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java b/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java index c5bab8055..b4bbbb2d9 100644 --- a/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/TestZFrameBase.java @@ -81,8 +81,8 @@ public void testColumnsNamesAndCount() throws Exception { ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); //assert on fields - List fieldsInTestData = new ArrayList<>(); - List fieldsInZFrame = new ArrayList<>(); + List fieldsInTestData = new ArrayList(); + List fieldsInZFrame = new ArrayList(); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); Arrays.stream(zFrame.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); assertEquals(fieldsInTestData, fieldsInZFrame, @@ -198,8 +198,8 @@ public void testDropSingleColumn() throws Exception { List sampleDataSet = createSampleDataList(); //List ZFrame zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); - List fieldsInZFrame = new ArrayList<>(); - List fieldsInTestData = new ArrayList<>(); + List fieldsInZFrame = new ArrayList(); + List fieldsInTestData = new ArrayList(); Arrays.stream(zFrame.drop("recid").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.remove("recid"); @@ -212,8 +212,8 @@ public void testDropColumnsAsStringArray() throws Exception { List sampleDataSet = createSampleDataList(); 
//List<Person> ZFrame<D, R, C> zFrame = dfObjectUtil.getDFFromObjectList(sampleDataSet, Person.class); - List<String> fieldsInZFrame = new ArrayList<>(); - List<String> fieldsInTestData = new ArrayList<>(); + List<String> fieldsInZFrame = new ArrayList<String>(); + List<String> fieldsInTestData = new ArrayList<String>(); Arrays.stream(zFrame.drop("recid", "surname", "postcode").fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.remove("recid"); @@ -298,8 +298,8 @@ public void testWithColumnForIntegerValue() throws Exception { int newColVal = 36; ZFrame<D, R, C> zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); - List<String> fieldsInTestData = new ArrayList<>(); - List<String> fieldsInZFrame = new ArrayList<>(); + List<String> fieldsInTestData = new ArrayList<String>(); + List<String> fieldsInZFrame = new ArrayList<String>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); @@ -322,8 +322,8 @@ public void testWithColumnForDoubleValue() throws Exception { double newColVal = 3.14; ZFrame<D, R, C> zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); - List<String> fieldsInTestData = new ArrayList<>(); - List<String> fieldsInZFrame = new ArrayList<>(); + List<String> fieldsInTestData = new ArrayList<String>(); + List<String> fieldsInZFrame = new ArrayList<String>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); @@ -346,8 +346,8 @@ public void testWithColumnForStringValue() throws Exception { String newColVal = "zingg"; ZFrame<D, R, C> zFrameWithAddedColumn = zFrame.withColumn(newCol, newColVal); - List<String> fieldsInTestData = new ArrayList<>(); - List<String> fieldsInZFrame = new ArrayList<>(); + List<String> fieldsInTestData = new ArrayList<String>(); + List<String> fieldsInZFrame = new ArrayList<String>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); @@ -370,8 +370,8 @@ public void testWithColumnForAnotherColumn() throws Exception { String newCol = NEW_COLUMN; ZFrame<D, R, C> zFrameWithAddedColumn = zFrame.withColumn(newCol, zFrame.col(oldCol)); - List<String> fieldsInTestData = new ArrayList<>(); - List<String> fieldsInZFrame = new ArrayList<>(); + List<String> fieldsInTestData = new ArrayList<String>(); + List<String> fieldsInZFrame = new ArrayList<String>(); Arrays.stream(zFrameWithAddedColumn.fields()).iterator().forEachRemaining(fieldZ -> fieldsInZFrame.add(fieldZ.getName())); Arrays.stream(Person.class.getFields()).sequential().forEach(fieldS -> fieldsInTestData.add(fieldS.getName())); fieldsInTestData.add(newCol); diff --git a/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java b/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java index 5bb0003a6..3bf63f254 100644 --- a/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java +++ b/common/core/src/test/java/zingg/common/core/zFrame/data/TestData.java @@ -90,7 +90,7 @@ public static List createSampleDataListWithMixedDataType() { public static List<ClusterZScore> createSampleDataZScore() { - List<ClusterZScore> sample = new ArrayList<>(); + List<ClusterZScore> sample = new ArrayList<ClusterZScore>();
sample.add(new ClusterZScore(0L, "100", 900.0)); sample.add(new ClusterZScore(1L, "100", 1001.0)); sample.add(new ClusterZScore(1L, "100", 1002.0)); @@ -107,7 +107,7 @@ public static List<ClusterZScore> createSampleDataZScore() { public static List<PairPartOne> createSampleDataCluster() { - List<PairPartOne> sample = new ArrayList<>(); + List<PairPartOne> sample = new ArrayList<PairPartOne>(); sample.add(new PairPartOne(1L, "100", 1001.0, "b")); sample.add(new PairPartOne(2L, "100", 1002.0, "a")); sample.add(new PairPartOne(3L, "100", 2001.0, "b")); @@ -119,7 +119,7 @@ public static List<PairPartOne> createSampleDataCluster() { public static List<PairPartTwo> createSampleDataClusterWithNull() { - List<PairPartTwo> sample = new ArrayList<>(); + List<PairPartTwo> sample = new ArrayList<PairPartTwo>(); sample.add(new PairPartTwo(1L, "100", 1001.0, "b")); sample.add(new PairPartTwo(2L, "100", 1002.0, "a")); sample.add(new PairPartTwo(3L, "100", 2001.0, null)); @@ -131,7 +131,7 @@ public static List<PairPartTwo> createSampleDataClusterWithNull() { public static List<InputWithZidAndSource> createSampleDataInput() { - List<InputWithZidAndSource> sample = new ArrayList<>(); + List<InputWithZidAndSource> sample = new ArrayList<InputWithZidAndSource>(); sample.add(new InputWithZidAndSource(1L, "fname1", "b")); sample.add(new InputWithZidAndSource(2L, "fname", "a")); sample.add(new InputWithZidAndSource(3L, "fna", "b")); diff --git a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java index d0471d9d4..0dcd25502 100644 --- a/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java +++ b/spark/core/src/test/java/zingg/common/core/block/TestSparkBlock.java @@ -19,7 +19,7 @@ public class TestSparkBlock extends TestBlockBase<SparkSession, Dataset<Row>, Row, Column, DataType> { public static ZinggSparkContext zsCTX = new ZinggSparkContext(); - public static IWithSession<SparkSession> iWithSession = new WithSession<>(); + public static IWithSession<SparkSession> iWithSession = new WithSession<SparkSession>(); public TestSparkBlock(SparkSession sparkSession) throws ZinggClientException { super(new SparkDFObjectUtil(iWithSession), new SparkHashUtil(sparkSession), new SparkBlockingTreeUtil(sparkSession, zsCTX.getPipeUtil())); From ca2610251300e73e32fe7f89c09c2abd2a44b312 Mon Sep 17 00:00:00 2001 From: semyonsinchenko Date: Sat, 24 Aug 2024 14:52:47 +0200 Subject: [PATCH 217/219] A part of changes --- pom.xml | 21 +- protobuf/connect_plugins.proto | 88 ++- python/requirements.txt | 2 +- python/zingg_v2/proto/connect_plugins_pb2.py | 22 +- python/zingg_v2/proto/connect_plugins_pb2.pyi | 320 +++++++++- .../spark/connect/ZinggConnectPlugin.java | 115 +++- .../spark/connect/proto/ConnectPlugins.java | 120 +++- .../spark/connect/proto/SubmitZinggJob.java | 583 ++++++++++++------ .../proto/SubmitZinggJobOrBuilder.java | 53 +- 9 files changed, 1082 insertions(+), 242 deletions(-) diff --git a/pom.xml b/pom.xml index 111119f49..960dc6422 100644 --- a/pom.xml +++ b/pom.xml @@ -48,37 +48,20 @@ spark-3.5 - false + true spark 3.5 - 3.5.0 + 3.5.2 2.12.10 3.5 2.12 0.8.3-spark3.5-s_2.12 - - spark-4.0 - - true - - spark - 4.0 - - - - 4.0.0-SNAPSHOT - 2.13.13 - 4.0 - 2.13 - 0.8.3-spark3.5-s_2.13 - - 0.4.0 diff --git a/protobuf/connect_plugins.proto b/protobuf/connect_plugins.proto index 085541b5f..9e265424e 100644 --- a/protobuf/connect_plugins.proto +++ b/protobuf/connect_plugins.proto @@ -4,6 +4,90 @@ option java_multiple_files = true; option java_package = "zingg.spark.connect.proto"; message SubmitZinggJob { - string args = 1; - string options = 2; + Arguments argumnets = 1; + ClientOptions cli_options = 2; + // The next message is a serialized LogicalPlan + optional bytes in_memory_date = 3; +} + +enum MatchType { + MT_FUZZY 
= 0; + MT_EXACT = 1; + MT_DONT_USE = 2; + MT_EMAIL = 3; + MT_PINCODE = 4; + MT_NULL_OR_BLANK = 5; + MT_TEXT = 6; + MT_NUMERIC = 7; + MT_NUMERIC_WITH_UNITS = 8; + MT_ONLY_ALPHABETS_EXACT = 9; + MT_ONLY_ALPHABETS_FUZZY = 10; +} + +enum DataFormat { + DF_CSV = 0; + DF_PARQUET = 1; + DF_JSON = 2; + DF_TEXT = 3; + DF_XLS = 4; + DF_AVRO = 5; + DF_JDBC = 6; + DF_CASSANDRA = 7; + DF_SNOWFLAKE = 8; + DF_ELASTIC = 9; + DF_EXACOL = 10; + DF_BIGQUEY = 11; + DF_INMEMORY = 12; +} + +message FieldDefinition { + MatchType match_type = 1; + string data_type = 2; + string field_name = 3; + string fields = 4; + optional string stop_words = 5; + optional string abbreviations = 6; +} + +message Pipe { + string name = 1; + DataFormat format = 2; + map props = 3; + optional string schema_field = 4; + optional string mode = 5; +} + +message Arguments { + repeated Pipe output = 1; + repeated Pipe data = 2; + string zingg_dir = 3; + repeated Pipe training_samples = 4; + repeated FieldDefinition fiield_definition = 5; + int32 num_partitions = 6; + float label_data_sample_size = 7; + string model_id = 8; + float threshold = 9; + int32 job_id = 10; + bool collect_metrics = 11; + bool show_concise = 12; + float stop_words_cutoff = 13; + int64 block_size = 14; + optional string column = 15; +} + +message ClientOptions { + string phase = 1; + string license = 2; + string email = 3; + string conf = 4; + optional string preprocess = 5; + optional string job_id = 6; + optional string format = 7; + optional string zingg_dir = 8; + optional string model_id = 9; + optional string collect_metrics = 10; + optional string show_concise = 11; + optional string location = 12; + optional string column = 13; + optional string remote = 14; } diff --git a/python/requirements.txt b/python/requirements.txt index 42c95be15..bf7be090e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,5 +3,5 @@ seaborn matplotlib sphinx sphinx-rtd-theme -pyspark[connect]>=3.5 +pyspark[connect]>=3.5.2 pydantic diff --git a/python/zingg_v2/proto/connect_plugins_pb2.py b/python/zingg_v2/proto/connect_plugins_pb2.py index 9323eb5f6..8543fde20 100644 --- a/python/zingg_v2/proto/connect_plugins_pb2.py +++ b/python/zingg_v2/proto/connect_plugins_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\">\n\x0eSubmitZinggJob\x12\x12\n\x04\x61rgs\x18\x01 \x01(\tR\x04\x61rgs\x12\x18\n\x07options\x18\x02 \x01(\tR\x07optionsB\x1d\n\x19zingg.spark.connect.protoP\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\"\xa9\x01\n\x0eSubmitZinggJob\x12(\n\targumnets\x18\x01 \x01(\x0b\x32\n.ArgumentsR\targumnets\x12/\n\x0b\x63li_options\x18\x02 \x01(\x0b\x32\x0e.ClientOptionsR\ncliOptions\x12)\n\x0ein_memory_date\x18\x03 \x01(\x0cH\x00R\x0cinMemoryDate\x88\x01\x01\x42\x11\n\x0f_in_memory_date\"\x80\x02\n\x0f\x46ieldDefinition\x12)\n\nmatch_type\x18\x01 \x01(\x0e\x32\n.MatchTypeR\tmatchType\x12\x1b\n\tdata_type\x18\x02 \x01(\tR\x08\x64\x61taType\x12\x1d\n\nfield_name\x18\x03 \x01(\tR\tfieldName\x12\x16\n\x06\x66ields\x18\x04 \x01(\tR\x06\x66ields\x12\"\n\nstop_words\x18\x05 \x01(\tH\x00R\tstopWords\x88\x01\x01\x12)\n\rabbreviations\x18\x06 \x01(\tH\x01R\rabbreviations\x88\x01\x01\x42\r\n\x0b_stop_wordsB\x10\n\x0e_abbreviations\"\xfc\x01\n\x04Pipe\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12#\n\x06\x66ormat\x18\x02 \x01(\x0e\x32\x0b.DataFormatR\x06\x66ormat\x12&\n\x05props\x18\x03 
\x03(\x0b\x32\x10.Pipe.PropsEntryR\x05props\x12&\n\x0cschema_field\x18\x04 \x01(\tH\x00R\x0bschemaField\x88\x01\x01\x12\x17\n\x04mode\x18\x05 \x01(\tH\x01R\x04mode\x88\x01\x01\x1a\x38\n\nPropsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0f\n\r_schema_fieldB\x07\n\x05_mode\"\xbe\x04\n\tArguments\x12\x1d\n\x06output\x18\x01 \x03(\x0b\x32\x05.PipeR\x06output\x12\x19\n\x04\x64\x61ta\x18\x02 \x03(\x0b\x32\x05.PipeR\x04\x64\x61ta\x12\x1b\n\tzingg_dir\x18\x03 \x01(\tR\x08zinggDir\x12\x30\n\x10training_samples\x18\x04 \x03(\x0b\x32\x05.PipeR\x0ftrainingSamples\x12=\n\x11\x66iield_definition\x18\x05 \x03(\x0b\x32\x10.FieldDefinitionR\x10\x66iieldDefinition\x12%\n\x0enum_partitions\x18\x06 \x01(\x05R\rnumPartitions\x12\x33\n\x16label_data_sample_size\x18\x07 \x01(\x02R\x13labelDataSampleSize\x12\x19\n\x08model_id\x18\x08 \x01(\tR\x07modelId\x12\x1c\n\tthreshold\x18\t \x01(\x02R\tthreshold\x12\x15\n\x06job_id\x18\n \x01(\x05R\x05jobId\x12\'\n\x0f\x63ollect_metrics\x18\x0b \x01(\x08R\x0e\x63ollectMetrics\x12!\n\x0cshow_concise\x18\x0c \x01(\x08R\x0bshowConcise\x12*\n\x11stop_words_cutoff\x18\r \x01(\x02R\x0fstopWordsCutoff\x12\x1d\n\nblock_size\x18\x0e \x01(\x05R\tblockSize\x12\x1b\n\x06\x63olumn\x18\x0f \x01(\tH\x00R\x06\x63olumn\x88\x01\x01\x42\t\n\x07_column\"\xc2\x04\n\rClientOptions\x12\x14\n\x05phase\x18\x01 \x01(\tR\x05phase\x12\x18\n\x07license\x18\x02 \x01(\tR\x07license\x12\x14\n\x05\x65mail\x18\x03 \x01(\tR\x05\x65mail\x12\x12\n\x04\x63onf\x18\x04 \x01(\tR\x04\x63onf\x12#\n\npreprocess\x18\x05 \x01(\tH\x00R\npreprocess\x88\x01\x01\x12\x1a\n\x06job_id\x18\x06 \x01(\tH\x01R\x05jobId\x88\x01\x01\x12\x1b\n\x06\x66ormat\x18\x07 \x01(\tH\x02R\x06\x66ormat\x88\x01\x01\x12 \n\tzingg_dir\x18\x08 \x01(\tH\x03R\x08zinggDir\x88\x01\x01\x12\x1e\n\x08model_id\x18\t \x01(\tH\x04R\x07modelId\x88\x01\x01\x12,\n\x0f\x63ollect_metrics\x18\n \x01(\tH\x05R\x0e\x63ollectMetrics\x88\x01\x01\x12&\n\x0cshow_concise\x18\x0b \x01(\tH\x06R\x0bshowConcise\x88\x01\x01\x12\x1f\n\x08location\x18\x0c \x01(\tH\x07R\x08location\x88\x01\x01\x12\x1b\n\x06\x63olumn\x18\r \x01(\tH\x08R\x06\x63olumn\x88\x01\x01\x12\x1b\n\x06remote\x18\x0e \x01(\tH\tR\x06remote\x88\x01\x01\x42\r\n\x0b_preprocessB\t\n\x07_job_idB\t\n\x07_formatB\x0c\n\n_zingg_dirB\x0b\n\t_model_idB\x12\n\x10_collect_metricsB\x0f\n\r_show_conciseB\x0b\n\t_locationB\t\n\x07_columnB\t\n\x07_remote*\xde\x01\n\tMatchType\x12\x0c\n\x08MT_FUZZY\x10\x00\x12\x0c\n\x08MT_EXACT\x10\x01\x12\x0f\n\x0bMT_DONT_USE\x10\x02\x12\x0c\n\x08MT_EMAIL\x10\x03\x12\x0e\n\nMT_PINCODE\x10\x04\x12\x14\n\x10MT_NULL_OR_BLANK\x10\x05\x12\x0b\n\x07MT_TEXT\x10\x06\x12\x0e\n\nMT_NUMERIC\x10\x07\x12\x19\n\x15MT_NUMERIC_WITH_UNITS\x10\x08\x12\x1b\n\x17MT_ONLY_ALPHABETS_EXACT\x10\t\x12\x1b\n\x17MT_ONLY_ALPHABETS_FUZZY\x10\n*\xcc\x01\n\nDataFormat\x12\n\n\x06\x44\x46_CSV\x10\x00\x12\x0e\n\nDF_PARQUET\x10\x01\x12\x0b\n\x07\x44\x46_JSON\x10\x02\x12\x0b\n\x07\x44\x46_TEXT\x10\x03\x12\n\n\x06\x44\x46_XLS\x10\x04\x12\x0b\n\x07\x44\x46_AVRO\x10\x05\x12\x0b\n\x07\x44\x46_JDBC\x10\x06\x12\x10\n\x0c\x44\x46_CASSANDRA\x10\x07\x12\x10\n\x0c\x44\x46_SNOWFLAKE\x10\x08\x12\x0e\n\nDF_ELASTIC\x10\t\x12\r\n\tDF_EXACOL\x10\n\x12\x0e\n\nDF_BIGQUEY\x10\x0b\x12\x0f\n\x0b\x44\x46_INMEMORY\x10\x0c\x42\x1d\n\x19zingg.spark.connect.protoP\x01\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -22,6 +22,22 @@ if _descriptor._USE_C_DESCRIPTORS == False: _globals['DESCRIPTOR']._options = None 
_globals['DESCRIPTOR']._serialized_options = b'\n\031zingg.spark.connect.protoP\001' - _globals['_SUBMITZINGGJOB']._serialized_start=25 - _globals['_SUBMITZINGGJOB']._serialized_end=87 + _globals['_PIPE_PROPSENTRY']._options = None + _globals['_PIPE_PROPSENTRY']._serialized_options = b'8\001' + _globals['_MATCHTYPE']._serialized_start=1870 + _globals['_MATCHTYPE']._serialized_end=2092 + _globals['_DATAFORMAT']._serialized_start=2095 + _globals['_DATAFORMAT']._serialized_end=2299 + _globals['_SUBMITZINGGJOB']._serialized_start=26 + _globals['_SUBMITZINGGJOB']._serialized_end=195 + _globals['_FIELDDEFINITION']._serialized_start=198 + _globals['_FIELDDEFINITION']._serialized_end=454 + _globals['_PIPE']._serialized_start=457 + _globals['_PIPE']._serialized_end=709 + _globals['_PIPE_PROPSENTRY']._serialized_start=627 + _globals['_PIPE_PROPSENTRY']._serialized_end=683 + _globals['_ARGUMENTS']._serialized_start=712 + _globals['_ARGUMENTS']._serialized_end=1286 + _globals['_CLIENTOPTIONS']._serialized_start=1289 + _globals['_CLIENTOPTIONS']._serialized_end=1867 # @@protoc_insertion_point(module_scope) diff --git a/python/zingg_v2/proto/connect_plugins_pb2.pyi b/python/zingg_v2/proto/connect_plugins_pb2.pyi index a49afce1d..cd4ca8c65 100644 --- a/python/zingg_v2/proto/connect_plugins_pb2.pyi +++ b/python/zingg_v2/proto/connect_plugins_pb2.pyi @@ -3,31 +3,335 @@ isort:skip_file """ import builtins +import collections.abc import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper import google.protobuf.message import sys +import typing -if sys.version_info >= (3, 8): +if sys.version_info >= (3, 10): import typing as typing_extensions else: import typing_extensions DESCRIPTOR: google.protobuf.descriptor.FileDescriptor +class _MatchType: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _MatchTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_MatchType.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + MT_FUZZY: _MatchType.ValueType # 0 + MT_EXACT: _MatchType.ValueType # 1 + MT_DONT_USE: _MatchType.ValueType # 2 + MT_EMAIL: _MatchType.ValueType # 3 + MT_PINCODE: _MatchType.ValueType # 4 + MT_NULL_OR_BLANK: _MatchType.ValueType # 5 + MT_TEXT: _MatchType.ValueType # 6 + MT_NUMERIC: _MatchType.ValueType # 7 + MT_NUMERIC_WITH_UNITS: _MatchType.ValueType # 8 + MT_ONLY_ALPHABETS_EXACT: _MatchType.ValueType # 9 + MT_ONLY_ALPHABETS_FUZZY: _MatchType.ValueType # 10 + +class MatchType(_MatchType, metaclass=_MatchTypeEnumTypeWrapper): ... 
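The regenerated stubs above wrap MatchType and DataFormat in protobuf EnumTypeWrappers, so wire values and their names convert both ways. A small usage sketch against the generated module (the package path python/zingg_v2/proto is the one added in this patch; the assertions are illustrative, not part of the test suite):

from zingg_v2.proto import connect_plugins_pb2 as pb

# Wire value <-> name round-trips via the generated enum wrappers.
assert pb.MatchType.Name(pb.MT_FUZZY) == "MT_FUZZY"
assert pb.MatchType.Value("MT_EXACT") == pb.MT_EXACT == 1
assert pb.DataFormat.Name(pb.DF_PARQUET) == "DF_PARQUET"
# Note the enum members keep the upstream spellings DF_EXACOL and DF_BIGQUEY.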
+ +MT_FUZZY: MatchType.ValueType # 0 +MT_EXACT: MatchType.ValueType # 1 +MT_DONT_USE: MatchType.ValueType # 2 +MT_EMAIL: MatchType.ValueType # 3 +MT_PINCODE: MatchType.ValueType # 4 +MT_NULL_OR_BLANK: MatchType.ValueType # 5 +MT_TEXT: MatchType.ValueType # 6 +MT_NUMERIC: MatchType.ValueType # 7 +MT_NUMERIC_WITH_UNITS: MatchType.ValueType # 8 +MT_ONLY_ALPHABETS_EXACT: MatchType.ValueType # 9 +MT_ONLY_ALPHABETS_FUZZY: MatchType.ValueType # 10 +global___MatchType = MatchType + +class _DataFormat: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _DataFormatEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_DataFormat.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + DF_CSV: _DataFormat.ValueType # 0 + DF_PARQUET: _DataFormat.ValueType # 1 + DF_JSON: _DataFormat.ValueType # 2 + DF_TEXT: _DataFormat.ValueType # 3 + DF_XLS: _DataFormat.ValueType # 4 + DF_AVRO: _DataFormat.ValueType # 5 + DF_JDBC: _DataFormat.ValueType # 6 + DF_CASSANDRA: _DataFormat.ValueType # 7 + DF_SNOWFLAKE: _DataFormat.ValueType # 8 + DF_ELASTIC: _DataFormat.ValueType # 9 + DF_EXACOL: _DataFormat.ValueType # 10 + DF_BIGQUEY: _DataFormat.ValueType # 11 + DF_INMEMORY: _DataFormat.ValueType # 12 + +class DataFormat(_DataFormat, metaclass=_DataFormatEnumTypeWrapper): ... + +DF_CSV: DataFormat.ValueType # 0 +DF_PARQUET: DataFormat.ValueType # 1 +DF_JSON: DataFormat.ValueType # 2 +DF_TEXT: DataFormat.ValueType # 3 +DF_XLS: DataFormat.ValueType # 4 +DF_AVRO: DataFormat.ValueType # 5 +DF_JDBC: DataFormat.ValueType # 6 +DF_CASSANDRA: DataFormat.ValueType # 7 +DF_SNOWFLAKE: DataFormat.ValueType # 8 +DF_ELASTIC: DataFormat.ValueType # 9 +DF_EXACOL: DataFormat.ValueType # 10 +DF_BIGQUEY: DataFormat.ValueType # 11 +DF_INMEMORY: DataFormat.ValueType # 12 +global___DataFormat = DataFormat + @typing_extensions.final class SubmitZinggJob(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - ARGS_FIELD_NUMBER: builtins.int - OPTIONS_FIELD_NUMBER: builtins.int - args: builtins.str - options: builtins.str + ARGUMNETS_FIELD_NUMBER: builtins.int + CLI_OPTIONS_FIELD_NUMBER: builtins.int + IN_MEMORY_DATE_FIELD_NUMBER: builtins.int + @property + def argumnets(self) -> global___Arguments: ... + @property + def cli_options(self) -> global___ClientOptions: ... + in_memory_date: builtins.bytes + """The next message is a serialized LogicalPlan""" def __init__( self, *, - args: builtins.str = ..., - options: builtins.str = ..., + argumnets: global___Arguments | None = ..., + cli_options: global___ClientOptions | None = ..., + in_memory_date: builtins.bytes | None = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["args", b"args", "options", b"options"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_in_memory_date", b"_in_memory_date", "argumnets", b"argumnets", "cli_options", b"cli_options", "in_memory_date", b"in_memory_date"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_in_memory_date", b"_in_memory_date", "argumnets", b"argumnets", "cli_options", b"cli_options", "in_memory_date", b"in_memory_date"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["_in_memory_date", b"_in_memory_date"]) -> typing_extensions.Literal["in_memory_date"] | None: ... 
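Per the SubmitZinggJob stub above, argumnets and cli_options are now nested messages and in_memory_date is a proto3 optional field, so presence is observable. A minimal sketch of building and round-tripping the message, assuming placeholder values rather than a working Zingg configuration:

from zingg_v2.proto import connect_plugins_pb2 as pb

job = pb.SubmitZinggJob(
    argumnets=pb.Arguments(model_id="100", zingg_dir="/tmp/zingg"),  # field name keeps the upstream typo
    cli_options=pb.ClientOptions(phase="match", license="zinggLic.txt",
                                 email="zingg@zingg.ai", conf="config.json"),
)
assert not job.HasField("in_memory_date")   # optional bytes field starts unset
job.in_memory_date = b""                    # placeholder for a serialized LogicalPlan
assert job.WhichOneof("_in_memory_date") == "in_memory_date"
decoded = pb.SubmitZinggJob.FromString(job.SerializeToString())
assert decoded.cli_options.phase == "match"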
global___SubmitZinggJob = SubmitZinggJob + +@typing_extensions.final +class FieldDefinition(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + MATCH_TYPE_FIELD_NUMBER: builtins.int + DATA_TYPE_FIELD_NUMBER: builtins.int + FIELD_NAME_FIELD_NUMBER: builtins.int + FIELDS_FIELD_NUMBER: builtins.int + STOP_WORDS_FIELD_NUMBER: builtins.int + ABBREVIATIONS_FIELD_NUMBER: builtins.int + match_type: global___MatchType.ValueType + data_type: builtins.str + field_name: builtins.str + fields: builtins.str + stop_words: builtins.str + abbreviations: builtins.str + def __init__( + self, + *, + match_type: global___MatchType.ValueType = ..., + data_type: builtins.str = ..., + field_name: builtins.str = ..., + fields: builtins.str = ..., + stop_words: builtins.str | None = ..., + abbreviations: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_abbreviations", b"_abbreviations", "_stop_words", b"_stop_words", "abbreviations", b"abbreviations", "stop_words", b"stop_words"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_abbreviations", b"_abbreviations", "_stop_words", b"_stop_words", "abbreviations", b"abbreviations", "data_type", b"data_type", "field_name", b"field_name", "fields", b"fields", "match_type", b"match_type", "stop_words", b"stop_words"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_abbreviations", b"_abbreviations"]) -> typing_extensions.Literal["abbreviations"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_stop_words", b"_stop_words"]) -> typing_extensions.Literal["stop_words"] | None: ... + +global___FieldDefinition = FieldDefinition + +@typing_extensions.final +class Pipe(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class PropsEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.str + value: builtins.str + def __init__( + self, + *, + key: builtins.str = ..., + value: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ... + + NAME_FIELD_NUMBER: builtins.int + FORMAT_FIELD_NUMBER: builtins.int + PROPS_FIELD_NUMBER: builtins.int + SCHEMA_FIELD_FIELD_NUMBER: builtins.int + MODE_FIELD_NUMBER: builtins.int + name: builtins.str + format: global___DataFormat.ValueType + @property + def props(self) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]: ... + schema_field: builtins.str + mode: builtins.str + def __init__( + self, + *, + name: builtins.str = ..., + format: global___DataFormat.ValueType = ..., + props: collections.abc.Mapping[builtins.str, builtins.str] | None = ..., + schema_field: builtins.str | None = ..., + mode: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_mode", b"_mode", "_schema_field", b"_schema_field", "mode", b"mode", "schema_field", b"schema_field"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_mode", b"_mode", "_schema_field", b"_schema_field", "format", b"format", "mode", b"mode", "name", b"name", "props", b"props", "schema_field", b"schema_field"]) -> None: ... 
+ @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_mode", b"_mode"]) -> typing_extensions.Literal["mode"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_schema_field", b"_schema_field"]) -> typing_extensions.Literal["schema_field"] | None: ... + +global___Pipe = Pipe + +@typing_extensions.final +class Arguments(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OUTPUT_FIELD_NUMBER: builtins.int + DATA_FIELD_NUMBER: builtins.int + ZINGG_DIR_FIELD_NUMBER: builtins.int + TRAINING_SAMPLES_FIELD_NUMBER: builtins.int + FIIELD_DEFINITION_FIELD_NUMBER: builtins.int + NUM_PARTITIONS_FIELD_NUMBER: builtins.int + LABEL_DATA_SAMPLE_SIZE_FIELD_NUMBER: builtins.int + MODEL_ID_FIELD_NUMBER: builtins.int + THRESHOLD_FIELD_NUMBER: builtins.int + JOB_ID_FIELD_NUMBER: builtins.int + COLLECT_METRICS_FIELD_NUMBER: builtins.int + SHOW_CONCISE_FIELD_NUMBER: builtins.int + STOP_WORDS_CUTOFF_FIELD_NUMBER: builtins.int + BLOCK_SIZE_FIELD_NUMBER: builtins.int + COLUMN_FIELD_NUMBER: builtins.int + @property + def output(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Pipe]: ... + @property + def data(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Pipe]: ... + zingg_dir: builtins.str + @property + def training_samples(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Pipe]: ... + @property + def fiield_definition(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___FieldDefinition]: ... + num_partitions: builtins.int + label_data_sample_size: builtins.float + model_id: builtins.str + threshold: builtins.float + job_id: builtins.int + collect_metrics: builtins.bool + show_concise: builtins.bool + stop_words_cutoff: builtins.float + block_size: builtins.int + column: builtins.str + def __init__( + self, + *, + output: collections.abc.Iterable[global___Pipe] | None = ..., + data: collections.abc.Iterable[global___Pipe] | None = ..., + zingg_dir: builtins.str = ..., + training_samples: collections.abc.Iterable[global___Pipe] | None = ..., + fiield_definition: collections.abc.Iterable[global___FieldDefinition] | None = ..., + num_partitions: builtins.int = ..., + label_data_sample_size: builtins.float = ..., + model_id: builtins.str = ..., + threshold: builtins.float = ..., + job_id: builtins.int = ..., + collect_metrics: builtins.bool = ..., + show_concise: builtins.bool = ..., + stop_words_cutoff: builtins.float = ..., + block_size: builtins.int = ..., + column: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_column", b"_column", "column", b"column"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_column", b"_column", "block_size", b"block_size", "collect_metrics", b"collect_metrics", "column", b"column", "data", b"data", "fiield_definition", b"fiield_definition", "job_id", b"job_id", "label_data_sample_size", b"label_data_sample_size", "model_id", b"model_id", "num_partitions", b"num_partitions", "output", b"output", "show_concise", b"show_concise", "stop_words_cutoff", b"stop_words_cutoff", "threshold", b"threshold", "training_samples", b"training_samples", "zingg_dir", b"zingg_dir"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["_column", b"_column"]) -> typing_extensions.Literal["column"] | None: ... 
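The Arguments stub above mirrors zingg.common.client.Arguments field by field; on the wire the whole job is wrapped in a google.protobuf.Any, which is what ZinggConnectPlugin.transform unpacks with relation.is/unpack further down in this patch. A sketch of the client-side packing, with hypothetical pipe locations:

from google.protobuf import any_pb2
from zingg_v2.proto import connect_plugins_pb2 as pb

args = pb.Arguments(
    data=[pb.Pipe(name="test", format=pb.DF_CSV,
                  props={"location": "examples/febrl/test.csv"})],  # hypothetical path
    output=[pb.Pipe(name="out", format=pb.DF_CSV,
                    props={"location": "/tmp/zinggOutput"})],       # hypothetical path
    zingg_dir="/tmp/zingg", model_id="100",
    num_partitions=4, label_data_sample_size=0.4,
)
packed = any_pb2.Any()
packed.Pack(pb.SubmitZinggJob(argumnets=args))
assert packed.Is(pb.SubmitZinggJob.DESCRIPTOR)  # the check the plugin performs server-side
raw = packed.SerializeToString()                # bytes handed to RelationPlugin.transform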
+ +global___Arguments = Arguments + +@typing_extensions.final +class ClientOptions(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PHASE_FIELD_NUMBER: builtins.int + LICENSE_FIELD_NUMBER: builtins.int + EMAIL_FIELD_NUMBER: builtins.int + CONF_FIELD_NUMBER: builtins.int + PREPROCESS_FIELD_NUMBER: builtins.int + JOB_ID_FIELD_NUMBER: builtins.int + FORMAT_FIELD_NUMBER: builtins.int + ZINGG_DIR_FIELD_NUMBER: builtins.int + MODEL_ID_FIELD_NUMBER: builtins.int + COLLECT_METRICS_FIELD_NUMBER: builtins.int + SHOW_CONCISE_FIELD_NUMBER: builtins.int + LOCATION_FIELD_NUMBER: builtins.int + COLUMN_FIELD_NUMBER: builtins.int + REMOTE_FIELD_NUMBER: builtins.int + phase: builtins.str + license: builtins.str + email: builtins.str + conf: builtins.str + preprocess: builtins.str + job_id: builtins.str + format: builtins.str + zingg_dir: builtins.str + model_id: builtins.str + collect_metrics: builtins.str + show_concise: builtins.str + location: builtins.str + column: builtins.str + remote: builtins.str + def __init__( + self, + *, + phase: builtins.str = ..., + license: builtins.str = ..., + email: builtins.str = ..., + conf: builtins.str = ..., + preprocess: builtins.str | None = ..., + job_id: builtins.str | None = ..., + format: builtins.str | None = ..., + zingg_dir: builtins.str | None = ..., + model_id: builtins.str | None = ..., + collect_metrics: builtins.str | None = ..., + show_concise: builtins.str | None = ..., + location: builtins.str | None = ..., + column: builtins.str | None = ..., + remote: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_format", b"_format", "_job_id", b"_job_id", "_location", b"_location", "_model_id", b"_model_id", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "format", b"format", "job_id", b"job_id", "location", b"location", "model_id", b"model_id", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_format", b"_format", "_job_id", b"_job_id", "_location", b"_location", "_model_id", b"_model_id", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "conf", b"conf", "email", b"email", "format", b"format", "job_id", b"job_id", "license", b"license", "location", b"location", "model_id", b"model_id", "phase", b"phase", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_collect_metrics", b"_collect_metrics"]) -> typing_extensions.Literal["collect_metrics"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_column", b"_column"]) -> typing_extensions.Literal["column"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_format", b"_format"]) -> typing_extensions.Literal["format"] | None: ... 
+ @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_job_id", b"_job_id"]) -> typing_extensions.Literal["job_id"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_location", b"_location"]) -> typing_extensions.Literal["location"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_model_id", b"_model_id"]) -> typing_extensions.Literal["model_id"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_preprocess", b"_preprocess"]) -> typing_extensions.Literal["preprocess"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_remote", b"_remote"]) -> typing_extensions.Literal["remote"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_show_concise", b"_show_concise"]) -> typing_extensions.Literal["show_concise"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_zingg_dir", b"_zingg_dir"]) -> typing_extensions.Literal["zingg_dir"] | None: ... + +global___ClientOptions = ClientOptions diff --git a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java index e65f1941f..1463f12b0 100644 --- a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java +++ b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java @@ -2,7 +2,6 @@ import com.google.protobuf.Any; import com.google.protobuf.InvalidProtocolBufferException; -import com.sun.tools.javac.util.List; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -13,14 +12,117 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; + +import scala.Option; import zingg.common.client.*; +import zingg.common.client.Arguments; +import zingg.common.client.ClientOptions; +import zingg.spark.connect.*; +import zingg.spark.connect.proto.*; +import zingg.common.client.pipe.Pipe; import zingg.spark.client.SparkClient; -import zingg.spark.connect.proto.SubmitZinggJob; +import zingg.spark.client.pipe.SparkPipe; +import java.util.Map; import java.util.Optional; +import java.util.HashMap; +import java.util.List; public class ZinggConnectPlugin implements RelationPlugin { + private SparkPipe parsePipe(zingg.spark.connect.proto.Pipe protoPipe) { + SparkPipe sparkPipe = new SparkPipe(); + sparkPipe.setName(protoPipe.getName()); + + // Parse DataFormat from proto + DataFormat dataFormatProto = protoPipe.getFormat(); + if (dataFormatProto == DataFormat.DF_AVRO) { + sparkPipe.setFormat(Pipe.FORMAT_AVRO); + } else if (dataFormatProto == DataFormat.DF_BIGQUEY) { + sparkPipe.setFormat(Pipe.FORMAT_BIGQUERY); + } else if (dataFormatProto == DataFormat.DF_CASSANDRA) { + sparkPipe.setFormat(Pipe.FORMAT_CASSANDRA); + } else if (dataFormatProto == DataFormat.DF_CSV) { + sparkPipe.setFormat(Pipe.FORMAT_CSV); + } else if (dataFormatProto == DataFormat.DF_ELASTIC) { + sparkPipe.setFormat(Pipe.FORMAT_ELASTIC); + } else if (dataFormatProto == DataFormat.DF_EXACOL) { + sparkPipe.setFormat(Pipe.FORMAT_EXASOL); + } else if (dataFormatProto == DataFormat.DF_INMEMORY) { + sparkPipe.setFormat(Pipe.FORMAT_INMEMORY); + } else if (dataFormatProto == DataFormat.DF_JDBC) { + sparkPipe.setFormat(Pipe.FORMAT_JDBC); + } else if (dataFormatProto == 
DataFormat.DF_JSON) { + sparkPipe.setFormat(Pipe.FORMAT_JSON); + } else if (dataFormatProto == DataFormat.DF_PARQUET) { + sparkPipe.setFormat(Pipe.FORMAT_PARQUET); + } else if (dataFormatProto == DataFormat.DF_SNOWFLAKE) { + sparkPipe.setFormat(Pipe.FORMAT_SNOWFLAKE); + } else if (dataFormatProto == DataFormat.DF_TEXT) { + sparkPipe.setFormat(Pipe.FORMAT_TEXT); + } else if (dataFormatProto == DataFormat.DF_XLS) { + sparkPipe.setFormat(Pipe.FORMAT_XLS); + } else { + throw new RuntimeException(String.format("Unknown format %s", dataFormatProto.name())); + } + + // Parse pipe properties + for (Map.Entry<String, String> kv : protoPipe.getPropsMap().entrySet()) { + sparkPipe.setProp(kv.getKey(), kv.getValue()); + } + + if (protoPipe.hasSchemaField()) { + sparkPipe.setSchema(protoPipe.getSchemaField()); + } + + if (protoPipe.hasMode()) { + sparkPipe.setMode(protoPipe.getMode()); + } + + return sparkPipe; + } + + private SparkPipe[] parsePipes(List<zingg.spark.connect.proto.Pipe> protoPipes) { + return protoPipes.stream().map(protoPipe -> parsePipe(protoPipe)).toArray(SparkPipe[]::new); + } + + // 3.5.2 behaviour + // Because of shading rules this method may be marked as wrongly overridden @Override + public Option<LogicalPlan> transform(Any relation, SparkConnectPlanner planner) { + if (relation.is(SubmitZinggJob.class)) { + SubmitZinggJob zinggJobProto = relation.unpack(SubmitZinggJob.class); + // It is expected that the session exists! + SparkSession spark = planner.sessionHolder().session(); + IArguments arguments = new Arguments(); + // Parse arguments + + // Output pipes + arguments.setOutput(parsePipes(zinggJobProto.getArgumnets().getOutputList())); + // Data pipes + arguments.setData(parsePipes(zinggJobProto.getArgumnets().getDataList())); + // Training samples + arguments.setTrainingSamples(parsePipes(zinggJobProto.getArgumnets().getTrainingSamplesList())); + + // Arguments + arguments.setZinggDir(zinggJobProto.getArgumnets().getZinggDir()); + arguments.setNumPartitions(zinggJobProto.getArgumnets().getNumPartitions()); + arguments.setLabelDataSampleSize(zinggJobProto.getArgumnets().getLabelDataSampleSize()); + arguments.setModelId(zinggJobProto.getArgumnets().getModelId()); + arguments.setThreshold(zinggJobProto.getArgumnets().getThreshold()); + arguments.setJobId(zinggJobProto.getArgumnets().getJobId()); + arguments.setCollectMetrics(zinggJobProto.getArgumnets().getCollectMetrics()); + arguments.setShowConcise(zinggJobProto.getArgumnets().getShowConcise()); + arguments.setStopWordsCutoff(zinggJobProto.getArgumnets().getStopWordsCutoff()); + arguments.setBlockSize(zinggJobProto.getArgumnets().getBlockSize()); + if (zinggJobProto.getArgumnets().hasColumn()) { + arguments.setColumn(zinggJobProto.getArgumnets().getColumn()); + } + + // Options + zingg.spark.connect.proto.ClientOptions clientOptionsProto = zinggJobProto.getCliOptions(); + } + } + public Optional<LogicalPlan> transform(byte[] bytes, SparkConnectPlanner sparkConnectPlanner) { Any command; try { @@ -44,14 +146,11 @@ public Optional<LogicalPlan> transform(byte[] bytes, SparkConnectPlanner sparkCo List.of( RowFactory.create( "SUCCESS", - new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments()) - ) - ), - new StructType(new StructField[]{ + new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments()))), + new StructType(new StructField[] { DataTypes.createStructField("status", DataTypes.StringType, false), DataTypes.createStructField("newArgs", DataTypes.StringType, false) - }) - ); + })); return Optional.of(outDF.logicalPlan()); } } diff --git 
a/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java index 506523040..d7eddb452 100644 --- a/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java +++ b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java @@ -20,6 +20,31 @@ public static void registerAllExtensions( static final com.google.protobuf.GeneratedMessageV3.FieldAccessorTable internal_static_SubmitZinggJob_fieldAccessorTable; + static final com.google.protobuf.Descriptors.Descriptor + internal_static_FieldDefinition_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_FieldDefinition_fieldAccessorTable; + static final com.google.protobuf.Descriptors.Descriptor + internal_static_Pipe_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_Pipe_fieldAccessorTable; + static final com.google.protobuf.Descriptors.Descriptor + internal_static_Pipe_PropsEntry_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_Pipe_PropsEntry_fieldAccessorTable; + static final com.google.protobuf.Descriptors.Descriptor + internal_static_Arguments_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_Arguments_fieldAccessorTable; + static final com.google.protobuf.Descriptors.Descriptor + internal_static_ClientOptions_descriptor; + static final + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_ClientOptions_fieldAccessorTable; public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() { @@ -29,10 +54,65 @@ public static void registerAllExtensions( descriptor; static { java.lang.String[] descriptorData = { - "\n\025connect_plugins.proto\">\n\016SubmitZinggJo" + - "b\022\022\n\004args\030\001 \001(\tR\004args\022\030\n\007options\030\002 \001(\tR\007" + - "optionsB\035\n\031zingg.spark.connect.protoP\001b\006" + - "proto3" + "\n\025connect_plugins.proto\"\251\001\n\016SubmitZinggJ" + + "ob\022(\n\targumnets\030\001 \001(\0132\n.ArgumentsR\targum" + + "nets\022/\n\013cli_options\030\002 \001(\0132\016.ClientOption" + + "sR\ncliOptions\022)\n\016in_memory_date\030\003 \001(\014H\000R" + + "\014inMemoryDate\210\001\001B\021\n\017_in_memory_date\"\200\002\n\017" + + "FieldDefinition\022)\n\nmatch_type\030\001 \001(\0162\n.Ma" + + "tchTypeR\tmatchType\022\033\n\tdata_type\030\002 \001(\tR\010d" + + "ataType\022\035\n\nfield_name\030\003 \001(\tR\tfieldName\022\026" + + "\n\006fields\030\004 \001(\tR\006fields\022\"\n\nstop_words\030\005 \001" + + "(\tH\000R\tstopWords\210\001\001\022)\n\rabbreviations\030\006 \001(" + + "\tH\001R\rabbreviations\210\001\001B\r\n\013_stop_wordsB\020\n\016" + + "_abbreviations\"\374\001\n\004Pipe\022\022\n\004name\030\001 \001(\tR\004n" + + "ame\022#\n\006format\030\002 \001(\0162\013.DataFormatR\006format" + + "\022&\n\005props\030\003 \003(\0132\020.Pipe.PropsEntryR\005props" + + "\022&\n\014schema_field\030\004 \001(\tH\000R\013schemaField\210\001\001" + + "\022\027\n\004mode\030\005 \001(\tH\001R\004mode\210\001\001\0328\n\nPropsEntry\022" + + "\020\n\003key\030\001 \001(\tR\003key\022\024\n\005value\030\002 \001(\tR\005value:" + + "\0028\001B\017\n\r_schema_fieldB\007\n\005_mode\"\276\004\n\tArgume" + + "nts\022\035\n\006output\030\001 \003(\0132\005.PipeR\006output\022\031\n\004da" + + "ta\030\002 
\003(\0132\005.PipeR\004data\022\033\n\tzingg_dir\030\003 \001(\t" + + "R\010zinggDir\0220\n\020training_samples\030\004 \003(\0132\005.P" + + "ipeR\017trainingSamples\022=\n\021fiield_definitio" + + "n\030\005 \003(\0132\020.FieldDefinitionR\020fiieldDefinit" + + "ion\022%\n\016num_partitions\030\006 \001(\005R\rnumPartitio" + + "ns\0223\n\026label_data_sample_size\030\007 \001(\002R\023labe" + + "lDataSampleSize\022\031\n\010model_id\030\010 \001(\tR\007model" + + "Id\022\034\n\tthreshold\030\t \001(\002R\tthreshold\022\025\n\006job_" + + "id\030\n \001(\005R\005jobId\022\'\n\017collect_metrics\030\013 \001(\010" + + "R\016collectMetrics\022!\n\014show_concise\030\014 \001(\010R\013" + + "showConcise\022*\n\021stop_words_cutoff\030\r \001(\002R\017" + + "stopWordsCutoff\022\035\n\nblock_size\030\016 \001(\005R\tblo" + + "ckSize\022\033\n\006column\030\017 \001(\tH\000R\006column\210\001\001B\t\n\007_" + + "column\"\302\004\n\rClientOptions\022\024\n\005phase\030\001 \001(\tR" + + "\005phase\022\030\n\007license\030\002 \001(\tR\007license\022\024\n\005emai" + + "l\030\003 \001(\tR\005email\022\022\n\004conf\030\004 \001(\tR\004conf\022#\n\npr" + + "eprocess\030\005 \001(\tH\000R\npreprocess\210\001\001\022\032\n\006job_i" + + "d\030\006 \001(\tH\001R\005jobId\210\001\001\022\033\n\006format\030\007 \001(\tH\002R\006f" + + "ormat\210\001\001\022 \n\tzingg_dir\030\010 \001(\tH\003R\010zinggDir\210" + + "\001\001\022\036\n\010model_id\030\t \001(\tH\004R\007modelId\210\001\001\022,\n\017co" + + "llect_metrics\030\n \001(\tH\005R\016collectMetrics\210\001\001" + + "\022&\n\014show_concise\030\013 \001(\tH\006R\013showConcise\210\001\001" + + "\022\037\n\010location\030\014 \001(\tH\007R\010location\210\001\001\022\033\n\006col" + + "umn\030\r \001(\tH\010R\006column\210\001\001\022\033\n\006remote\030\016 \001(\tH\t" + + "R\006remote\210\001\001B\r\n\013_preprocessB\t\n\007_job_idB\t\n" + + "\007_formatB\014\n\n_zingg_dirB\013\n\t_model_idB\022\n\020_" + + "collect_metricsB\017\n\r_show_conciseB\013\n\t_loc" + + "ationB\t\n\007_columnB\t\n\007_remote*\336\001\n\tMatchTyp" + + "e\022\014\n\010MT_FUZZY\020\000\022\014\n\010MT_EXACT\020\001\022\017\n\013MT_DONT" + + "_USE\020\002\022\014\n\010MT_EMAIL\020\003\022\016\n\nMT_PINCODE\020\004\022\024\n\020" + + "MT_NULL_OR_BLANK\020\005\022\013\n\007MT_TEXT\020\006\022\016\n\nMT_NU" + + "MERIC\020\007\022\031\n\025MT_NUMERIC_WITH_UNITS\020\010\022\033\n\027MT" + + "_ONLY_ALPHABETS_EXACT\020\t\022\033\n\027MT_ONLY_ALPHA" + + "BETS_FUZZY\020\n*\314\001\n\nDataFormat\022\n\n\006DF_CSV\020\000\022" + + "\016\n\nDF_PARQUET\020\001\022\013\n\007DF_JSON\020\002\022\013\n\007DF_TEXT\020" + + "\003\022\n\n\006DF_XLS\020\004\022\013\n\007DF_AVRO\020\005\022\013\n\007DF_JDBC\020\006\022" + + "\020\n\014DF_CASSANDRA\020\007\022\020\n\014DF_SNOWFLAKE\020\010\022\016\n\nD" + + "F_ELASTIC\020\t\022\r\n\tDF_EXACOL\020\n\022\016\n\nDF_BIGQUEY" + + "\020\013\022\017\n\013DF_INMEMORY\020\014B\035\n\031zingg.spark.conne" + + "ct.protoP\001b\006proto3" }; descriptor = com.google.protobuf.Descriptors.FileDescriptor .internalBuildGeneratedFileFrom(descriptorData, @@ -43,7 +123,37 @@ public static void registerAllExtensions( internal_static_SubmitZinggJob_fieldAccessorTable = new com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( internal_static_SubmitZinggJob_descriptor, - new java.lang.String[] { "Args", "Options", }); + new java.lang.String[] { "Argumnets", "CliOptions", "InMemoryDate", }); + internal_static_FieldDefinition_descriptor = + 
getDescriptor().getMessageTypes().get(1); + internal_static_FieldDefinition_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_FieldDefinition_descriptor, + new java.lang.String[] { "MatchType", "DataType", "FieldName", "Fields", "StopWords", "Abbreviations", }); + internal_static_Pipe_descriptor = + getDescriptor().getMessageTypes().get(2); + internal_static_Pipe_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_Pipe_descriptor, + new java.lang.String[] { "Name", "Format", "Props", "SchemaField", "Mode", }); + internal_static_Pipe_PropsEntry_descriptor = + internal_static_Pipe_descriptor.getNestedTypes().get(0); + internal_static_Pipe_PropsEntry_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_Pipe_PropsEntry_descriptor, + new java.lang.String[] { "Key", "Value", }); + internal_static_Arguments_descriptor = + getDescriptor().getMessageTypes().get(3); + internal_static_Arguments_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_Arguments_descriptor, + new java.lang.String[] { "Output", "Data", "ZinggDir", "TrainingSamples", "FiieldDefinition", "NumPartitions", "LabelDataSampleSize", "ModelId", "Threshold", "JobId", "CollectMetrics", "ShowConcise", "StopWordsCutoff", "BlockSize", "Column", }); + internal_static_ClientOptions_descriptor = + getDescriptor().getMessageTypes().get(4); + internal_static_ClientOptions_fieldAccessorTable = new + com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_ClientOptions_descriptor, + new java.lang.String[] { "Phase", "License", "Email", "Conf", "Preprocess", "JobId", "Format", "ZinggDir", "ModelId", "CollectMetrics", "ShowConcise", "Location", "Column", "Remote", }); } // @@protoc_insertion_point(outer_class_scope) diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java index 4a74b18a4..fdf0377fc 100644 --- a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java +++ b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJob.java @@ -17,8 +17,7 @@ private SubmitZinggJob(com.google.protobuf.GeneratedMessageV3.Builder builder super(builder); } private SubmitZinggJob() { - args_ = ""; - options_ = ""; + inMemoryDate_ = com.google.protobuf.ByteString.EMPTY; } @java.lang.Override @@ -41,82 +40,84 @@ protected java.lang.Object newInstance( zingg.spark.connect.proto.SubmitZinggJob.class, zingg.spark.connect.proto.SubmitZinggJob.Builder.class); } - public static final int ARGS_FIELD_NUMBER = 1; - @SuppressWarnings("serial") - private volatile java.lang.Object args_ = ""; + private int bitField0_; + public static final int ARGUMNETS_FIELD_NUMBER = 1; + private zingg.spark.connect.proto.Arguments argumnets_; /** - * string args = 1 [json_name = "args"]; - * @return The args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + * @return Whether the argumnets field is set. 
*/ @java.lang.Override - public java.lang.String getArgs() { - java.lang.Object ref = args_; - if (ref instanceof java.lang.String) { - return (java.lang.String) ref; - } else { - com.google.protobuf.ByteString bs = - (com.google.protobuf.ByteString) ref; - java.lang.String s = bs.toStringUtf8(); - args_ = s; - return s; - } + public boolean hasArgumnets() { + return ((bitField0_ & 0x00000001) != 0); } /** - * string args = 1 [json_name = "args"]; - * @return The bytes for args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + * @return The argumnets. */ @java.lang.Override - public com.google.protobuf.ByteString - getArgsBytes() { - java.lang.Object ref = args_; - if (ref instanceof java.lang.String) { - com.google.protobuf.ByteString b = - com.google.protobuf.ByteString.copyFromUtf8( - (java.lang.String) ref); - args_ = b; - return b; - } else { - return (com.google.protobuf.ByteString) ref; - } - } - - public static final int OPTIONS_FIELD_NUMBER = 2; - @SuppressWarnings("serial") - private volatile java.lang.Object options_ = ""; + public zingg.spark.connect.proto.Arguments getArgumnets() { + return argumnets_ == null ? zingg.spark.connect.proto.Arguments.getDefaultInstance() : argumnets_; + } /** - * string options = 2 [json_name = "options"]; - * @return The options. + * .Arguments argumnets = 1 [json_name = "argumnets"]; */ @java.lang.Override - public java.lang.String getOptions() { - java.lang.Object ref = options_; - if (ref instanceof java.lang.String) { - return (java.lang.String) ref; - } else { - com.google.protobuf.ByteString bs = - (com.google.protobuf.ByteString) ref; - java.lang.String s = bs.toStringUtf8(); - options_ = s; - return s; - } + public zingg.spark.connect.proto.ArgumentsOrBuilder getArgumnetsOrBuilder() { + return argumnets_ == null ? zingg.spark.connect.proto.Arguments.getDefaultInstance() : argumnets_; } + + public static final int CLI_OPTIONS_FIELD_NUMBER = 2; + private zingg.spark.connect.proto.ClientOptions cliOptions_; /** - * string options = 2 [json_name = "options"]; - * @return The bytes for options. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return Whether the cliOptions field is set. */ @java.lang.Override - public com.google.protobuf.ByteString - getOptionsBytes() { - java.lang.Object ref = options_; - if (ref instanceof java.lang.String) { - com.google.protobuf.ByteString b = - com.google.protobuf.ByteString.copyFromUtf8( - (java.lang.String) ref); - options_ = b; - return b; - } else { - return (com.google.protobuf.ByteString) ref; - } + public boolean hasCliOptions() { + return ((bitField0_ & 0x00000002) != 0); + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return The cliOptions. + */ + @java.lang.Override + public zingg.spark.connect.proto.ClientOptions getCliOptions() { + return cliOptions_ == null ? zingg.spark.connect.proto.ClientOptions.getDefaultInstance() : cliOptions_; + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + @java.lang.Override + public zingg.spark.connect.proto.ClientOptionsOrBuilder getCliOptionsOrBuilder() { + return cliOptions_ == null ? zingg.spark.connect.proto.ClientOptions.getDefaultInstance() : cliOptions_; + } + + public static final int IN_MEMORY_DATE_FIELD_NUMBER = 3; + private com.google.protobuf.ByteString inMemoryDate_ = com.google.protobuf.ByteString.EMPTY; + /** + *
<pre>
+   * The next message is a serialized LogicalPlan
+   * </pre>
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return Whether the inMemoryDate field is set. + */ + @java.lang.Override + public boolean hasInMemoryDate() { + return ((bitField0_ & 0x00000004) != 0); + } + /** + *
+   * The next message is a serialized LogicalPlan
+   * </pre>
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return The inMemoryDate. + */ + @java.lang.Override + public com.google.protobuf.ByteString getInMemoryDate() { + return inMemoryDate_; } private byte memoizedIsInitialized = -1; @@ -133,11 +134,14 @@ public final boolean isInitialized() { @java.lang.Override public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { - if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(args_)) { - com.google.protobuf.GeneratedMessageV3.writeString(output, 1, args_); + if (((bitField0_ & 0x00000001) != 0)) { + output.writeMessage(1, getArgumnets()); } - if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(options_)) { - com.google.protobuf.GeneratedMessageV3.writeString(output, 2, options_); + if (((bitField0_ & 0x00000002) != 0)) { + output.writeMessage(2, getCliOptions()); + } + if (((bitField0_ & 0x00000004) != 0)) { + output.writeBytes(3, inMemoryDate_); } getUnknownFields().writeTo(output); } @@ -148,11 +152,17 @@ public int getSerializedSize() { if (size != -1) return size; size = 0; - if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(args_)) { - size += com.google.protobuf.GeneratedMessageV3.computeStringSize(1, args_); + if (((bitField0_ & 0x00000001) != 0)) { + size += com.google.protobuf.CodedOutputStream + .computeMessageSize(1, getArgumnets()); + } + if (((bitField0_ & 0x00000002) != 0)) { + size += com.google.protobuf.CodedOutputStream + .computeMessageSize(2, getCliOptions()); } - if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(options_)) { - size += com.google.protobuf.GeneratedMessageV3.computeStringSize(2, options_); + if (((bitField0_ & 0x00000004) != 0)) { + size += com.google.protobuf.CodedOutputStream + .computeBytesSize(3, inMemoryDate_); } size += getUnknownFields().getSerializedSize(); memoizedSize = size; @@ -169,10 +179,21 @@ public boolean equals(final java.lang.Object obj) { } zingg.spark.connect.proto.SubmitZinggJob other = (zingg.spark.connect.proto.SubmitZinggJob) obj; - if (!getArgs() - .equals(other.getArgs())) return false; - if (!getOptions() - .equals(other.getOptions())) return false; + if (hasArgumnets() != other.hasArgumnets()) return false; + if (hasArgumnets()) { + if (!getArgumnets() + .equals(other.getArgumnets())) return false; + } + if (hasCliOptions() != other.hasCliOptions()) return false; + if (hasCliOptions()) { + if (!getCliOptions() + .equals(other.getCliOptions())) return false; + } + if (hasInMemoryDate() != other.hasInMemoryDate()) return false; + if (hasInMemoryDate()) { + if (!getInMemoryDate() + .equals(other.getInMemoryDate())) return false; + } if (!getUnknownFields().equals(other.getUnknownFields())) return false; return true; } @@ -184,10 +205,18 @@ public int hashCode() { } int hash = 41; hash = (19 * hash) + getDescriptor().hashCode(); - hash = (37 * hash) + ARGS_FIELD_NUMBER; - hash = (53 * hash) + getArgs().hashCode(); - hash = (37 * hash) + OPTIONS_FIELD_NUMBER; - hash = (53 * hash) + getOptions().hashCode(); + if (hasArgumnets()) { + hash = (37 * hash) + ARGUMNETS_FIELD_NUMBER; + hash = (53 * hash) + getArgumnets().hashCode(); + } + if (hasCliOptions()) { + hash = (37 * hash) + CLI_OPTIONS_FIELD_NUMBER; + hash = (53 * hash) + getCliOptions().hashCode(); + } + if (hasInMemoryDate()) { + hash = (37 * hash) + IN_MEMORY_DATE_FIELD_NUMBER; + hash = (53 * hash) + getInMemoryDate().hashCode(); + } hash = (29 * hash) + getUnknownFields().hashCode(); memoizedHashCode = hash; return hash; @@ -307,20 +336,36 @@ 
public static final class Builder extends // Construct using zingg.spark.connect.proto.SubmitZinggJob.newBuilder() private Builder() { - + maybeForceBuilderInitialization(); } private Builder( com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { super(parent); - + maybeForceBuilderInitialization(); + } + private void maybeForceBuilderInitialization() { + if (com.google.protobuf.GeneratedMessageV3 + .alwaysUseFieldBuilders) { + getArgumnetsFieldBuilder(); + getCliOptionsFieldBuilder(); + } } @java.lang.Override public Builder clear() { super.clear(); bitField0_ = 0; - args_ = ""; - options_ = ""; + argumnets_ = null; + if (argumnetsBuilder_ != null) { + argumnetsBuilder_.dispose(); + argumnetsBuilder_ = null; + } + cliOptions_ = null; + if (cliOptionsBuilder_ != null) { + cliOptionsBuilder_.dispose(); + cliOptionsBuilder_ = null; + } + inMemoryDate_ = com.google.protobuf.ByteString.EMPTY; return this; } @@ -354,12 +399,24 @@ public zingg.spark.connect.proto.SubmitZinggJob buildPartial() { private void buildPartial0(zingg.spark.connect.proto.SubmitZinggJob result) { int from_bitField0_ = bitField0_; + int to_bitField0_ = 0; if (((from_bitField0_ & 0x00000001) != 0)) { - result.args_ = args_; + result.argumnets_ = argumnetsBuilder_ == null + ? argumnets_ + : argumnetsBuilder_.build(); + to_bitField0_ |= 0x00000001; } if (((from_bitField0_ & 0x00000002) != 0)) { - result.options_ = options_; + result.cliOptions_ = cliOptionsBuilder_ == null + ? cliOptions_ + : cliOptionsBuilder_.build(); + to_bitField0_ |= 0x00000002; } + if (((from_bitField0_ & 0x00000004) != 0)) { + result.inMemoryDate_ = inMemoryDate_; + to_bitField0_ |= 0x00000004; + } + result.bitField0_ |= to_bitField0_; } @java.lang.Override @@ -406,15 +463,14 @@ public Builder mergeFrom(com.google.protobuf.Message other) { public Builder mergeFrom(zingg.spark.connect.proto.SubmitZinggJob other) { if (other == zingg.spark.connect.proto.SubmitZinggJob.getDefaultInstance()) return this; - if (!other.getArgs().isEmpty()) { - args_ = other.args_; - bitField0_ |= 0x00000001; - onChanged(); + if (other.hasArgumnets()) { + mergeArgumnets(other.getArgumnets()); } - if (!other.getOptions().isEmpty()) { - options_ = other.options_; - bitField0_ |= 0x00000002; - onChanged(); + if (other.hasCliOptions()) { + mergeCliOptions(other.getCliOptions()); + } + if (other.hasInMemoryDate()) { + setInMemoryDate(other.getInMemoryDate()); } this.mergeUnknownFields(other.getUnknownFields()); onChanged(); @@ -443,15 +499,24 @@ public Builder mergeFrom( done = true; break; case 10: { - args_ = input.readStringRequireUtf8(); + input.readMessage( + getArgumnetsFieldBuilder().getBuilder(), + extensionRegistry); bitField0_ |= 0x00000001; break; } // case 10 case 18: { - options_ = input.readStringRequireUtf8(); + input.readMessage( + getCliOptionsFieldBuilder().getBuilder(), + extensionRegistry); bitField0_ |= 0x00000002; break; } // case 18 + case 26: { + inMemoryDate_ = input.readBytes(); + bitField0_ |= 0x00000004; + break; + } // case 26 default: { if (!super.parseUnknownField(input, extensionRegistry, tag)) { done = true; // was an endgroup tag @@ -469,146 +534,300 @@ public Builder mergeFrom( } private int bitField0_; - private java.lang.Object args_ = ""; + private zingg.spark.connect.proto.Arguments argumnets_; + private com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.Arguments, zingg.spark.connect.proto.Arguments.Builder, zingg.spark.connect.proto.ArgumentsOrBuilder> argumnetsBuilder_; + /** + * .Arguments argumnets = 1 
[json_name = "argumnets"]; + * @return Whether the argumnets field is set. + */ + public boolean hasArgumnets() { + return ((bitField0_ & 0x00000001) != 0); + } /** - * string args = 1 [json_name = "args"]; - * @return The args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + * @return The argumnets. */ - public java.lang.String getArgs() { - java.lang.Object ref = args_; - if (!(ref instanceof java.lang.String)) { - com.google.protobuf.ByteString bs = - (com.google.protobuf.ByteString) ref; - java.lang.String s = bs.toStringUtf8(); - args_ = s; - return s; + public zingg.spark.connect.proto.Arguments getArgumnets() { + if (argumnetsBuilder_ == null) { + return argumnets_ == null ? zingg.spark.connect.proto.Arguments.getDefaultInstance() : argumnets_; } else { - return (java.lang.String) ref; + return argumnetsBuilder_.getMessage(); } } /** - * string args = 1 [json_name = "args"]; - * @return The bytes for args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; */ - public com.google.protobuf.ByteString - getArgsBytes() { - java.lang.Object ref = args_; - if (ref instanceof String) { - com.google.protobuf.ByteString b = - com.google.protobuf.ByteString.copyFromUtf8( - (java.lang.String) ref); - args_ = b; - return b; + public Builder setArgumnets(zingg.spark.connect.proto.Arguments value) { + if (argumnetsBuilder_ == null) { + if (value == null) { + throw new NullPointerException(); + } + argumnets_ = value; } else { - return (com.google.protobuf.ByteString) ref; + argumnetsBuilder_.setMessage(value); } + bitField0_ |= 0x00000001; + onChanged(); + return this; } /** - * string args = 1 [json_name = "args"]; - * @param value The args to set. - * @return This builder for chaining. + * .Arguments argumnets = 1 [json_name = "argumnets"]; */ - public Builder setArgs( - java.lang.String value) { - if (value == null) { throw new NullPointerException(); } - args_ = value; + public Builder setArgumnets( + zingg.spark.connect.proto.Arguments.Builder builderForValue) { + if (argumnetsBuilder_ == null) { + argumnets_ = builderForValue.build(); + } else { + argumnetsBuilder_.setMessage(builderForValue.build()); + } bitField0_ |= 0x00000001; onChanged(); return this; } /** - * string args = 1 [json_name = "args"]; - * @return This builder for chaining. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + */ + public Builder mergeArgumnets(zingg.spark.connect.proto.Arguments value) { + if (argumnetsBuilder_ == null) { + if (((bitField0_ & 0x00000001) != 0) && + argumnets_ != null && + argumnets_ != zingg.spark.connect.proto.Arguments.getDefaultInstance()) { + getArgumnetsBuilder().mergeFrom(value); + } else { + argumnets_ = value; + } + } else { + argumnetsBuilder_.mergeFrom(value); + } + if (argumnets_ != null) { + bitField0_ |= 0x00000001; + onChanged(); + } + return this; + } + /** + * .Arguments argumnets = 1 [json_name = "argumnets"]; */ - public Builder clearArgs() { - args_ = getDefaultInstance().getArgs(); + public Builder clearArgumnets() { bitField0_ = (bitField0_ & ~0x00000001); + argumnets_ = null; + if (argumnetsBuilder_ != null) { + argumnetsBuilder_.dispose(); + argumnetsBuilder_ = null; + } onChanged(); return this; } /** - * string args = 1 [json_name = "args"]; - * @param value The bytes for args to set. - * @return This builder for chaining. 
+ * .Arguments argumnets = 1 [json_name = "argumnets"]; */ - public Builder setArgsBytes( - com.google.protobuf.ByteString value) { - if (value == null) { throw new NullPointerException(); } - checkByteStringIsUtf8(value); - args_ = value; + public zingg.spark.connect.proto.Arguments.Builder getArgumnetsBuilder() { bitField0_ |= 0x00000001; onChanged(); - return this; + return getArgumnetsFieldBuilder().getBuilder(); + } + /** + * .Arguments argumnets = 1 [json_name = "argumnets"]; + */ + public zingg.spark.connect.proto.ArgumentsOrBuilder getArgumnetsOrBuilder() { + if (argumnetsBuilder_ != null) { + return argumnetsBuilder_.getMessageOrBuilder(); + } else { + return argumnets_ == null ? + zingg.spark.connect.proto.Arguments.getDefaultInstance() : argumnets_; + } + } + /** + * .Arguments argumnets = 1 [json_name = "argumnets"]; + */ + private com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.Arguments, zingg.spark.connect.proto.Arguments.Builder, zingg.spark.connect.proto.ArgumentsOrBuilder> + getArgumnetsFieldBuilder() { + if (argumnetsBuilder_ == null) { + argumnetsBuilder_ = new com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.Arguments, zingg.spark.connect.proto.Arguments.Builder, zingg.spark.connect.proto.ArgumentsOrBuilder>( + getArgumnets(), + getParentForChildren(), + isClean()); + argumnets_ = null; + } + return argumnetsBuilder_; } - private java.lang.Object options_ = ""; + private zingg.spark.connect.proto.ClientOptions cliOptions_; + private com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.ClientOptions, zingg.spark.connect.proto.ClientOptions.Builder, zingg.spark.connect.proto.ClientOptionsOrBuilder> cliOptionsBuilder_; /** - * string options = 2 [json_name = "options"]; - * @return The options. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return Whether the cliOptions field is set. */ - public java.lang.String getOptions() { - java.lang.Object ref = options_; - if (!(ref instanceof java.lang.String)) { - com.google.protobuf.ByteString bs = - (com.google.protobuf.ByteString) ref; - java.lang.String s = bs.toStringUtf8(); - options_ = s; - return s; + public boolean hasCliOptions() { + return ((bitField0_ & 0x00000002) != 0); + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return The cliOptions. + */ + public zingg.spark.connect.proto.ClientOptions getCliOptions() { + if (cliOptionsBuilder_ == null) { + return cliOptions_ == null ? zingg.spark.connect.proto.ClientOptions.getDefaultInstance() : cliOptions_; } else { - return (java.lang.String) ref; + return cliOptionsBuilder_.getMessage(); } } /** - * string options = 2 [json_name = "options"]; - * @return The bytes for options. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; */ - public com.google.protobuf.ByteString - getOptionsBytes() { - java.lang.Object ref = options_; - if (ref instanceof String) { - com.google.protobuf.ByteString b = - com.google.protobuf.ByteString.copyFromUtf8( - (java.lang.String) ref); - options_ = b; - return b; + public Builder setCliOptions(zingg.spark.connect.proto.ClientOptions value) { + if (cliOptionsBuilder_ == null) { + if (value == null) { + throw new NullPointerException(); + } + cliOptions_ = value; } else { - return (com.google.protobuf.ByteString) ref; + cliOptionsBuilder_.setMessage(value); } + bitField0_ |= 0x00000002; + onChanged(); + return this; } /** - * string options = 2 [json_name = "options"]; - * @param value The options to set. 
- * @return This builder for chaining. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; */ - public Builder setOptions( - java.lang.String value) { - if (value == null) { throw new NullPointerException(); } - options_ = value; + public Builder setCliOptions( + zingg.spark.connect.proto.ClientOptions.Builder builderForValue) { + if (cliOptionsBuilder_ == null) { + cliOptions_ = builderForValue.build(); + } else { + cliOptionsBuilder_.setMessage(builderForValue.build()); + } bitField0_ |= 0x00000002; onChanged(); return this; } /** - * string options = 2 [json_name = "options"]; - * @return This builder for chaining. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + public Builder mergeCliOptions(zingg.spark.connect.proto.ClientOptions value) { + if (cliOptionsBuilder_ == null) { + if (((bitField0_ & 0x00000002) != 0) && + cliOptions_ != null && + cliOptions_ != zingg.spark.connect.proto.ClientOptions.getDefaultInstance()) { + getCliOptionsBuilder().mergeFrom(value); + } else { + cliOptions_ = value; + } + } else { + cliOptionsBuilder_.mergeFrom(value); + } + if (cliOptions_ != null) { + bitField0_ |= 0x00000002; + onChanged(); + } + return this; + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; */ - public Builder clearOptions() { - options_ = getDefaultInstance().getOptions(); + public Builder clearCliOptions() { bitField0_ = (bitField0_ & ~0x00000002); + cliOptions_ = null; + if (cliOptionsBuilder_ != null) { + cliOptionsBuilder_.dispose(); + cliOptionsBuilder_ = null; + } onChanged(); return this; } /** - * string options = 2 [json_name = "options"]; - * @param value The bytes for options to set. + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + public zingg.spark.connect.proto.ClientOptions.Builder getCliOptionsBuilder() { + bitField0_ |= 0x00000002; + onChanged(); + return getCliOptionsFieldBuilder().getBuilder(); + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + public zingg.spark.connect.proto.ClientOptionsOrBuilder getCliOptionsOrBuilder() { + if (cliOptionsBuilder_ != null) { + return cliOptionsBuilder_.getMessageOrBuilder(); + } else { + return cliOptions_ == null ? + zingg.spark.connect.proto.ClientOptions.getDefaultInstance() : cliOptions_; + } + } + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + private com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.ClientOptions, zingg.spark.connect.proto.ClientOptions.Builder, zingg.spark.connect.proto.ClientOptionsOrBuilder> + getCliOptionsFieldBuilder() { + if (cliOptionsBuilder_ == null) { + cliOptionsBuilder_ = new com.google.protobuf.SingleFieldBuilderV3< + zingg.spark.connect.proto.ClientOptions, zingg.spark.connect.proto.ClientOptions.Builder, zingg.spark.connect.proto.ClientOptionsOrBuilder>( + getCliOptions(), + getParentForChildren(), + isClean()); + cliOptions_ = null; + } + return cliOptionsBuilder_; + } + + private com.google.protobuf.ByteString inMemoryDate_ = com.google.protobuf.ByteString.EMPTY; + /** + *
+     * This field holds a serialized LogicalPlan
+     * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return Whether the inMemoryDate field is set. + */ + @java.lang.Override + public boolean hasInMemoryDate() { + return ((bitField0_ & 0x00000004) != 0); + } + /** + *
+     * This field holds a serialized LogicalPlan
+     * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return The inMemoryDate. + */ + @java.lang.Override + public com.google.protobuf.ByteString getInMemoryDate() { + return inMemoryDate_; + } + /** + *
+     * This field holds a serialized LogicalPlan
+     * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @param value The inMemoryDate to set. * @return This builder for chaining. */ - public Builder setOptionsBytes( - com.google.protobuf.ByteString value) { + public Builder setInMemoryDate(com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } - checkByteStringIsUtf8(value); - options_ = value; - bitField0_ |= 0x00000002; + inMemoryDate_ = value; + bitField0_ |= 0x00000004; + onChanged(); + return this; + } + /** + *
+     * This field holds a serialized LogicalPlan
+     * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return This builder for chaining. + */ + public Builder clearInMemoryDate() { + bitField0_ = (bitField0_ & ~0x00000004); + inMemoryDate_ = getDefaultInstance().getInMemoryDate(); onChanged(); return this; } diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java index a231069d6..18777abf4 100644 --- a/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java +++ b/spark/client/src/main/java/zingg/spark/connect/proto/SubmitZinggJobOrBuilder.java @@ -9,26 +9,51 @@ public interface SubmitZinggJobOrBuilder extends com.google.protobuf.MessageOrBuilder { /** - * string args = 1 [json_name = "args"]; - * @return The args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + * @return Whether the argumnets field is set. */ - java.lang.String getArgs(); + boolean hasArgumnets(); /** - * string args = 1 [json_name = "args"]; - * @return The bytes for args. + * .Arguments argumnets = 1 [json_name = "argumnets"]; + * @return The argumnets. */ - com.google.protobuf.ByteString - getArgsBytes(); + zingg.spark.connect.proto.Arguments getArgumnets(); + /** + * .Arguments argumnets = 1 [json_name = "argumnets"]; + */ + zingg.spark.connect.proto.ArgumentsOrBuilder getArgumnetsOrBuilder(); + + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return Whether the cliOptions field is set. + */ + boolean hasCliOptions(); + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + * @return The cliOptions. + */ + zingg.spark.connect.proto.ClientOptions getCliOptions(); + /** + * .ClientOptions cli_options = 2 [json_name = "cliOptions"]; + */ + zingg.spark.connect.proto.ClientOptionsOrBuilder getCliOptionsOrBuilder(); /** - * string options = 2 [json_name = "options"]; - * @return The options. + *
+   * This field holds a serialized LogicalPlan
+   * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return Whether the inMemoryDate field is set. */ - java.lang.String getOptions(); + boolean hasInMemoryDate(); /** - * string options = 2 [json_name = "options"]; - * @return The bytes for options. + *
+   * This field holds a serialized LogicalPlan
+   * 
+ * + * optional bytes in_memory_date = 3 [json_name = "inMemoryDate"]; + * @return The inMemoryDate. */ - com.google.protobuf.ByteString - getOptionsBytes(); + com.google.protobuf.ByteString getInMemoryDate(); } From aa9d8fe91e35972d02183145d71fc0c7577835ac Mon Sep 17 00:00:00 2001 From: semyonsinchenko Date: Sat, 24 Aug 2024 15:22:06 +0200 Subject: [PATCH 218/219] Merging typo --- spark/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/spark/pom.xml b/spark/pom.xml index 9c20dadeb..5e9a89a7f 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -61,7 +61,6 @@ - com.google.protobuf protobuf-java From a9466719c5f1822fe2b5195559aecd60ccaeaf82 Mon Sep 17 00:00:00 2001 From: semyonsinchenko Date: Sun, 25 Aug 2024 15:10:09 +0200 Subject: [PATCH 219/219] A new implementation On branch main Your branch is up to date with 'origin/main'. Changes to be committed: modified: common/client/src/main/java/zingg/common/client/ClientOptions.java modified: protobuf/connect_plugins.proto modified: python/zingg_v2/proto/connect_plugins_pb2.py modified: python/zingg_v2/proto/connect_plugins_pb2.pyi modified: spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java modified: spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java --- .../zingg/common/client/ClientOptions.java | 12 +- protobuf/connect_plugins.proto | 10 +- python/zingg_v2/proto/connect_plugins_pb2.py | 12 +- python/zingg_v2/proto/connect_plugins_pb2.pyi | 25 ++- .../spark/connect/ZinggConnectPlugin.java | 153 ++++++++++++------ .../spark/connect/proto/ConnectPlugins.java | 59 +++---- 6 files changed, 172 insertions(+), 99 deletions(-) diff --git a/common/client/src/main/java/zingg/common/client/ClientOptions.java b/common/client/src/main/java/zingg/common/client/ClientOptions.java index 430cbe47b..cb1aa0929 100644 --- a/common/client/src/main/java/zingg/common/client/ClientOptions.java +++ b/common/client/src/main/java/zingg/common/client/ClientOptions.java @@ -299,9 +299,13 @@ public String getOptionValue(String a) { return get(a).getValue(); //throw new IllegalArgumentException("Wrong argument"); } - - - - + /** A helper that allows to modify ClientOptions by changing values */ + public void setOptionValue(String key, String value) { + if (has(key)) { + OptionWithVal optionWithVal = get(key); + optionWithVal.setValue(value); + options.put(key, optionWithVal); + } + } } diff --git a/protobuf/connect_plugins.proto b/protobuf/connect_plugins.proto index 9e265424e..94382746b 100644 --- a/protobuf/connect_plugins.proto +++ b/protobuf/connect_plugins.proto @@ -41,7 +41,7 @@ enum DataFormat { } message FieldDefinition { - MatchType match_type = 1; + repeated MatchType match_type = 1; string data_type = 2; string field_name = 3; string fields = 4; @@ -76,10 +76,10 @@ message Arguments { } message ClientOptions { - string phase = 1; - string license = 2; - string email = 3; - string conf = 4; + optional string phase = 1; + optional string license = 2; + optional string email = 3; + optional string conf = 4; optional string preprocess = 5; optional string job_id = 6; optional string format = 7; diff --git a/python/zingg_v2/proto/connect_plugins_pb2.py b/python/zingg_v2/proto/connect_plugins_pb2.py index 8543fde20..18cdd505a 100644 --- a/python/zingg_v2/proto/connect_plugins_pb2.py +++ b/python/zingg_v2/proto/connect_plugins_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\"\xa9\x01\n\x0eSubmitZinggJob\x12(\n\targumnets\x18\x01 
\x01(\x0b\x32\n.ArgumentsR\targumnets\x12/\n\x0b\x63li_options\x18\x02 \x01(\x0b\x32\x0e.ClientOptionsR\ncliOptions\x12)\n\x0ein_memory_date\x18\x03 \x01(\x0cH\x00R\x0cinMemoryDate\x88\x01\x01\x42\x11\n\x0f_in_memory_date\"\x80\x02\n\x0f\x46ieldDefinition\x12)\n\nmatch_type\x18\x01 \x01(\x0e\x32\n.MatchTypeR\tmatchType\x12\x1b\n\tdata_type\x18\x02 \x01(\tR\x08\x64\x61taType\x12\x1d\n\nfield_name\x18\x03 \x01(\tR\tfieldName\x12\x16\n\x06\x66ields\x18\x04 \x01(\tR\x06\x66ields\x12\"\n\nstop_words\x18\x05 \x01(\tH\x00R\tstopWords\x88\x01\x01\x12)\n\rabbreviations\x18\x06 \x01(\tH\x01R\rabbreviations\x88\x01\x01\x42\r\n\x0b_stop_wordsB\x10\n\x0e_abbreviations\"\xfc\x01\n\x04Pipe\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12#\n\x06\x66ormat\x18\x02 \x01(\x0e\x32\x0b.DataFormatR\x06\x66ormat\x12&\n\x05props\x18\x03 \x03(\x0b\x32\x10.Pipe.PropsEntryR\x05props\x12&\n\x0cschema_field\x18\x04 \x01(\tH\x00R\x0bschemaField\x88\x01\x01\x12\x17\n\x04mode\x18\x05 \x01(\tH\x01R\x04mode\x88\x01\x01\x1a\x38\n\nPropsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0f\n\r_schema_fieldB\x07\n\x05_mode\"\xbe\x04\n\tArguments\x12\x1d\n\x06output\x18\x01 \x03(\x0b\x32\x05.PipeR\x06output\x12\x19\n\x04\x64\x61ta\x18\x02 \x03(\x0b\x32\x05.PipeR\x04\x64\x61ta\x12\x1b\n\tzingg_dir\x18\x03 \x01(\tR\x08zinggDir\x12\x30\n\x10training_samples\x18\x04 \x03(\x0b\x32\x05.PipeR\x0ftrainingSamples\x12=\n\x11\x66iield_definition\x18\x05 \x03(\x0b\x32\x10.FieldDefinitionR\x10\x66iieldDefinition\x12%\n\x0enum_partitions\x18\x06 \x01(\x05R\rnumPartitions\x12\x33\n\x16label_data_sample_size\x18\x07 \x01(\x02R\x13labelDataSampleSize\x12\x19\n\x08model_id\x18\x08 \x01(\tR\x07modelId\x12\x1c\n\tthreshold\x18\t \x01(\x02R\tthreshold\x12\x15\n\x06job_id\x18\n \x01(\x05R\x05jobId\x12\'\n\x0f\x63ollect_metrics\x18\x0b \x01(\x08R\x0e\x63ollectMetrics\x12!\n\x0cshow_concise\x18\x0c \x01(\x08R\x0bshowConcise\x12*\n\x11stop_words_cutoff\x18\r \x01(\x02R\x0fstopWordsCutoff\x12\x1d\n\nblock_size\x18\x0e \x01(\x05R\tblockSize\x12\x1b\n\x06\x63olumn\x18\x0f \x01(\tH\x00R\x06\x63olumn\x88\x01\x01\x42\t\n\x07_column\"\xc2\x04\n\rClientOptions\x12\x14\n\x05phase\x18\x01 \x01(\tR\x05phase\x12\x18\n\x07license\x18\x02 \x01(\tR\x07license\x12\x14\n\x05\x65mail\x18\x03 \x01(\tR\x05\x65mail\x12\x12\n\x04\x63onf\x18\x04 \x01(\tR\x04\x63onf\x12#\n\npreprocess\x18\x05 \x01(\tH\x00R\npreprocess\x88\x01\x01\x12\x1a\n\x06job_id\x18\x06 \x01(\tH\x01R\x05jobId\x88\x01\x01\x12\x1b\n\x06\x66ormat\x18\x07 \x01(\tH\x02R\x06\x66ormat\x88\x01\x01\x12 \n\tzingg_dir\x18\x08 \x01(\tH\x03R\x08zinggDir\x88\x01\x01\x12\x1e\n\x08model_id\x18\t \x01(\tH\x04R\x07modelId\x88\x01\x01\x12,\n\x0f\x63ollect_metrics\x18\n \x01(\tH\x05R\x0e\x63ollectMetrics\x88\x01\x01\x12&\n\x0cshow_concise\x18\x0b \x01(\tH\x06R\x0bshowConcise\x88\x01\x01\x12\x1f\n\x08location\x18\x0c \x01(\tH\x07R\x08location\x88\x01\x01\x12\x1b\n\x06\x63olumn\x18\r \x01(\tH\x08R\x06\x63olumn\x88\x01\x01\x12\x1b\n\x06remote\x18\x0e 
\x01(\tH\tR\x06remote\x88\x01\x01\x42\r\n\x0b_preprocessB\t\n\x07_job_idB\t\n\x07_formatB\x0c\n\n_zingg_dirB\x0b\n\t_model_idB\x12\n\x10_collect_metricsB\x0f\n\r_show_conciseB\x0b\n\t_locationB\t\n\x07_columnB\t\n\x07_remote*\xde\x01\n\tMatchType\x12\x0c\n\x08MT_FUZZY\x10\x00\x12\x0c\n\x08MT_EXACT\x10\x01\x12\x0f\n\x0bMT_DONT_USE\x10\x02\x12\x0c\n\x08MT_EMAIL\x10\x03\x12\x0e\n\nMT_PINCODE\x10\x04\x12\x14\n\x10MT_NULL_OR_BLANK\x10\x05\x12\x0b\n\x07MT_TEXT\x10\x06\x12\x0e\n\nMT_NUMERIC\x10\x07\x12\x19\n\x15MT_NUMERIC_WITH_UNITS\x10\x08\x12\x1b\n\x17MT_ONLY_ALPHABETS_EXACT\x10\t\x12\x1b\n\x17MT_ONLY_ALPHABETS_FUZZY\x10\n*\xcc\x01\n\nDataFormat\x12\n\n\x06\x44\x46_CSV\x10\x00\x12\x0e\n\nDF_PARQUET\x10\x01\x12\x0b\n\x07\x44\x46_JSON\x10\x02\x12\x0b\n\x07\x44\x46_TEXT\x10\x03\x12\n\n\x06\x44\x46_XLS\x10\x04\x12\x0b\n\x07\x44\x46_AVRO\x10\x05\x12\x0b\n\x07\x44\x46_JDBC\x10\x06\x12\x10\n\x0c\x44\x46_CASSANDRA\x10\x07\x12\x10\n\x0c\x44\x46_SNOWFLAKE\x10\x08\x12\x0e\n\nDF_ELASTIC\x10\t\x12\r\n\tDF_EXACOL\x10\n\x12\x0e\n\nDF_BIGQUEY\x10\x0b\x12\x0f\n\x0b\x44\x46_INMEMORY\x10\x0c\x42\x1d\n\x19zingg.spark.connect.protoP\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63onnect_plugins.proto\"\xa9\x01\n\x0eSubmitZinggJob\x12(\n\targumnets\x18\x01 \x01(\x0b\x32\n.ArgumentsR\targumnets\x12/\n\x0b\x63li_options\x18\x02 \x01(\x0b\x32\x0e.ClientOptionsR\ncliOptions\x12)\n\x0ein_memory_date\x18\x03 \x01(\x0cH\x00R\x0cinMemoryDate\x88\x01\x01\x42\x11\n\x0f_in_memory_date\"\x80\x02\n\x0f\x46ieldDefinition\x12)\n\nmatch_type\x18\x01 \x03(\x0e\x32\n.MatchTypeR\tmatchType\x12\x1b\n\tdata_type\x18\x02 \x01(\tR\x08\x64\x61taType\x12\x1d\n\nfield_name\x18\x03 \x01(\tR\tfieldName\x12\x16\n\x06\x66ields\x18\x04 \x01(\tR\x06\x66ields\x12\"\n\nstop_words\x18\x05 \x01(\tH\x00R\tstopWords\x88\x01\x01\x12)\n\rabbreviations\x18\x06 \x01(\tH\x01R\rabbreviations\x88\x01\x01\x42\r\n\x0b_stop_wordsB\x10\n\x0e_abbreviations\"\xfc\x01\n\x04Pipe\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12#\n\x06\x66ormat\x18\x02 \x01(\x0e\x32\x0b.DataFormatR\x06\x66ormat\x12&\n\x05props\x18\x03 \x03(\x0b\x32\x10.Pipe.PropsEntryR\x05props\x12&\n\x0cschema_field\x18\x04 \x01(\tH\x00R\x0bschemaField\x88\x01\x01\x12\x17\n\x04mode\x18\x05 \x01(\tH\x01R\x04mode\x88\x01\x01\x1a\x38\n\nPropsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0f\n\r_schema_fieldB\x07\n\x05_mode\"\xbe\x04\n\tArguments\x12\x1d\n\x06output\x18\x01 \x03(\x0b\x32\x05.PipeR\x06output\x12\x19\n\x04\x64\x61ta\x18\x02 \x03(\x0b\x32\x05.PipeR\x04\x64\x61ta\x12\x1b\n\tzingg_dir\x18\x03 \x01(\tR\x08zinggDir\x12\x30\n\x10training_samples\x18\x04 \x03(\x0b\x32\x05.PipeR\x0ftrainingSamples\x12=\n\x11\x66iield_definition\x18\x05 \x03(\x0b\x32\x10.FieldDefinitionR\x10\x66iieldDefinition\x12%\n\x0enum_partitions\x18\x06 \x01(\x05R\rnumPartitions\x12\x33\n\x16label_data_sample_size\x18\x07 \x01(\x02R\x13labelDataSampleSize\x12\x19\n\x08model_id\x18\x08 \x01(\tR\x07modelId\x12\x1c\n\tthreshold\x18\t \x01(\x02R\tthreshold\x12\x15\n\x06job_id\x18\n \x01(\x05R\x05jobId\x12\'\n\x0f\x63ollect_metrics\x18\x0b \x01(\x08R\x0e\x63ollectMetrics\x12!\n\x0cshow_concise\x18\x0c \x01(\x08R\x0bshowConcise\x12*\n\x11stop_words_cutoff\x18\r \x01(\x02R\x0fstopWordsCutoff\x12\x1d\n\nblock_size\x18\x0e \x01(\x03R\tblockSize\x12\x1b\n\x06\x63olumn\x18\x0f \x01(\tH\x00R\x06\x63olumn\x88\x01\x01\x42\t\n\x07_column\"\xff\x04\n\rClientOptions\x12\x19\n\x05phase\x18\x01 
\x01(\tH\x00R\x05phase\x88\x01\x01\x12\x1d\n\x07license\x18\x02 \x01(\tH\x01R\x07license\x88\x01\x01\x12\x19\n\x05\x65mail\x18\x03 \x01(\tH\x02R\x05\x65mail\x88\x01\x01\x12\x17\n\x04\x63onf\x18\x04 \x01(\tH\x03R\x04\x63onf\x88\x01\x01\x12#\n\npreprocess\x18\x05 \x01(\tH\x04R\npreprocess\x88\x01\x01\x12\x1a\n\x06job_id\x18\x06 \x01(\tH\x05R\x05jobId\x88\x01\x01\x12\x1b\n\x06\x66ormat\x18\x07 \x01(\tH\x06R\x06\x66ormat\x88\x01\x01\x12 \n\tzingg_dir\x18\x08 \x01(\tH\x07R\x08zinggDir\x88\x01\x01\x12\x1e\n\x08model_id\x18\t \x01(\tH\x08R\x07modelId\x88\x01\x01\x12,\n\x0f\x63ollect_metrics\x18\n \x01(\tH\tR\x0e\x63ollectMetrics\x88\x01\x01\x12&\n\x0cshow_concise\x18\x0b \x01(\tH\nR\x0bshowConcise\x88\x01\x01\x12\x1f\n\x08location\x18\x0c \x01(\tH\x0bR\x08location\x88\x01\x01\x12\x1b\n\x06\x63olumn\x18\r \x01(\tH\x0cR\x06\x63olumn\x88\x01\x01\x12\x1b\n\x06remote\x18\x0e \x01(\tH\rR\x06remote\x88\x01\x01\x42\x08\n\x06_phaseB\n\n\x08_licenseB\x08\n\x06_emailB\x07\n\x05_confB\r\n\x0b_preprocessB\t\n\x07_job_idB\t\n\x07_formatB\x0c\n\n_zingg_dirB\x0b\n\t_model_idB\x12\n\x10_collect_metricsB\x0f\n\r_show_conciseB\x0b\n\t_locationB\t\n\x07_columnB\t\n\x07_remote*\xde\x01\n\tMatchType\x12\x0c\n\x08MT_FUZZY\x10\x00\x12\x0c\n\x08MT_EXACT\x10\x01\x12\x0f\n\x0bMT_DONT_USE\x10\x02\x12\x0c\n\x08MT_EMAIL\x10\x03\x12\x0e\n\nMT_PINCODE\x10\x04\x12\x14\n\x10MT_NULL_OR_BLANK\x10\x05\x12\x0b\n\x07MT_TEXT\x10\x06\x12\x0e\n\nMT_NUMERIC\x10\x07\x12\x19\n\x15MT_NUMERIC_WITH_UNITS\x10\x08\x12\x1b\n\x17MT_ONLY_ALPHABETS_EXACT\x10\t\x12\x1b\n\x17MT_ONLY_ALPHABETS_FUZZY\x10\n*\xcc\x01\n\nDataFormat\x12\n\n\x06\x44\x46_CSV\x10\x00\x12\x0e\n\nDF_PARQUET\x10\x01\x12\x0b\n\x07\x44\x46_JSON\x10\x02\x12\x0b\n\x07\x44\x46_TEXT\x10\x03\x12\n\n\x06\x44\x46_XLS\x10\x04\x12\x0b\n\x07\x44\x46_AVRO\x10\x05\x12\x0b\n\x07\x44\x46_JDBC\x10\x06\x12\x10\n\x0c\x44\x46_CASSANDRA\x10\x07\x12\x10\n\x0c\x44\x46_SNOWFLAKE\x10\x08\x12\x0e\n\nDF_ELASTIC\x10\t\x12\r\n\tDF_EXACOL\x10\n\x12\x0e\n\nDF_BIGQUEY\x10\x0b\x12\x0f\n\x0b\x44\x46_INMEMORY\x10\x0c\x42\x1d\n\x19zingg.spark.connect.protoP\x01\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -24,10 +24,10 @@ _globals['DESCRIPTOR']._serialized_options = b'\n\031zingg.spark.connect.protoP\001' _globals['_PIPE_PROPSENTRY']._options = None _globals['_PIPE_PROPSENTRY']._serialized_options = b'8\001' - _globals['_MATCHTYPE']._serialized_start=1870 - _globals['_MATCHTYPE']._serialized_end=2092 - _globals['_DATAFORMAT']._serialized_start=2095 - _globals['_DATAFORMAT']._serialized_end=2299 + _globals['_MATCHTYPE']._serialized_start=1931 + _globals['_MATCHTYPE']._serialized_end=2153 + _globals['_DATAFORMAT']._serialized_start=2156 + _globals['_DATAFORMAT']._serialized_end=2360 _globals['_SUBMITZINGGJOB']._serialized_start=26 _globals['_SUBMITZINGGJOB']._serialized_end=195 _globals['_FIELDDEFINITION']._serialized_start=198 @@ -39,5 +39,5 @@ _globals['_ARGUMENTS']._serialized_start=712 _globals['_ARGUMENTS']._serialized_end=1286 _globals['_CLIENTOPTIONS']._serialized_start=1289 - _globals['_CLIENTOPTIONS']._serialized_end=1867 + _globals['_CLIENTOPTIONS']._serialized_end=1928 # @@protoc_insertion_point(module_scope) diff --git a/python/zingg_v2/proto/connect_plugins_pb2.pyi b/python/zingg_v2/proto/connect_plugins_pb2.pyi index cd4ca8c65..b42895889 100644 --- a/python/zingg_v2/proto/connect_plugins_pb2.pyi +++ b/python/zingg_v2/proto/connect_plugins_pb2.pyi @@ -124,7 +124,8 @@ class FieldDefinition(google.protobuf.message.Message): FIELDS_FIELD_NUMBER: 
builtins.int STOP_WORDS_FIELD_NUMBER: builtins.int ABBREVIATIONS_FIELD_NUMBER: builtins.int - match_type: global___MatchType.ValueType + @property + def match_type(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[global___MatchType.ValueType]: ... data_type: builtins.str field_name: builtins.str fields: builtins.str @@ -133,7 +134,7 @@ class FieldDefinition(google.protobuf.message.Message): def __init__( self, *, - match_type: global___MatchType.ValueType = ..., + match_type: collections.abc.Iterable[global___MatchType.ValueType] | None = ..., data_type: builtins.str = ..., field_name: builtins.str = ..., fields: builtins.str = ..., @@ -296,10 +297,10 @@ class ClientOptions(google.protobuf.message.Message): def __init__( self, *, - phase: builtins.str = ..., - license: builtins.str = ..., - email: builtins.str = ..., - conf: builtins.str = ..., + phase: builtins.str | None = ..., + license: builtins.str | None = ..., + email: builtins.str | None = ..., + conf: builtins.str | None = ..., preprocess: builtins.str | None = ..., job_id: builtins.str | None = ..., format: builtins.str | None = ..., @@ -311,21 +312,29 @@ class ClientOptions(google.protobuf.message.Message): column: builtins.str | None = ..., remote: builtins.str | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_format", b"_format", "_job_id", b"_job_id", "_location", b"_location", "_model_id", b"_model_id", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "format", b"format", "job_id", b"job_id", "location", b"location", "model_id", b"model_id", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_format", b"_format", "_job_id", b"_job_id", "_location", b"_location", "_model_id", b"_model_id", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "conf", b"conf", "email", b"email", "format", b"format", "job_id", b"job_id", "license", b"license", "location", b"location", "model_id", b"model_id", "phase", b"phase", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_conf", b"_conf", "_email", b"_email", "_format", b"_format", "_job_id", b"_job_id", "_license", b"_license", "_location", b"_location", "_model_id", b"_model_id", "_phase", b"_phase", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "conf", b"conf", "email", b"email", "format", b"format", "job_id", b"job_id", "license", b"license", "location", b"location", "model_id", b"model_id", "phase", b"phase", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["_collect_metrics", b"_collect_metrics", "_column", b"_column", "_conf", b"_conf", "_email", b"_email", "_format", b"_format", "_job_id", b"_job_id", "_license", b"_license", "_location", b"_location", "_model_id", b"_model_id", "_phase", b"_phase", "_preprocess", b"_preprocess", "_remote", b"_remote", "_show_concise", b"_show_concise", "_zingg_dir", b"_zingg_dir", "collect_metrics", b"collect_metrics", "column", b"column", "conf", b"conf", "email", b"email", "format", b"format", "job_id", b"job_id", "license", b"license", "location", b"location", "model_id", b"model_id", "phase", b"phase", "preprocess", b"preprocess", "remote", b"remote", "show_concise", b"show_concise", "zingg_dir", b"zingg_dir"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_collect_metrics", b"_collect_metrics"]) -> typing_extensions.Literal["collect_metrics"] | None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_column", b"_column"]) -> typing_extensions.Literal["column"] | None: ... @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_conf", b"_conf"]) -> typing_extensions.Literal["conf"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_email", b"_email"]) -> typing_extensions.Literal["email"] | None: ... + @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_format", b"_format"]) -> typing_extensions.Literal["format"] | None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_job_id", b"_job_id"]) -> typing_extensions.Literal["job_id"] | None: ... @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_license", b"_license"]) -> typing_extensions.Literal["license"] | None: ... + @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_location", b"_location"]) -> typing_extensions.Literal["location"] | None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_model_id", b"_model_id"]) -> typing_extensions.Literal["model_id"] | None: ... @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_phase", b"_phase"]) -> typing_extensions.Literal["phase"] | None: ... + @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_preprocess", b"_preprocess"]) -> typing_extensions.Literal["preprocess"] | None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_remote", b"_remote"]) -> typing_extensions.Literal["remote"] | None: ... 
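For context on the regenerated Python stubs above: every ClientOptions field is now optional, so presence is tracked per field, and FieldDefinition.match_type is now repeated. The following is a minimal sketch of building the new message shapes, assuming the zingg_v2.proto.connect_plugins_pb2 module from this patch is importable; field spellings such as "argumnets" and "fiield_definition" are kept exactly as generated:

    from zingg_v2.proto import connect_plugins_pb2 as pb

    # ClientOptions fields are explicit-presence now: HasField distinguishes
    # "never set" from "set to the proto3 default".
    opts = pb.ClientOptions(phase="trainMatch")
    assert opts.HasField("phase")          # set explicitly
    assert not opts.HasField("license")    # left unset

    # match_type is repeated: one field definition can carry several match types.
    fd = pb.FieldDefinition(
        match_type=[pb.MatchType.MT_FUZZY, pb.MatchType.MT_NULL_OR_BLANK],
        field_name="fname",
        data_type="string",
    )

    job = pb.SubmitZinggJob(cli_options=opts)
    job.argumnets.fiield_definition.append(fd)   # spellings as generated
    payload = job.SerializeToString()            # bytes handed to the plugin

The ZinggConnectPlugin diff below consumes exactly these shapes: it maps each proto MatchType onto zingg.common.client.MatchType and copies only those ClientOptions fields whose presence bit is set.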
diff --git a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java index 1463f12b0..5a60df46c 100644 --- a/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java +++ b/spark/client/src/main/java/zingg/spark/connect/ZinggConnectPlugin.java @@ -1,7 +1,10 @@ package zingg.spark.connect; +import java.util.List; +import java.util.Map; + import com.google.protobuf.Any; -import com.google.protobuf.InvalidProtocolBufferException; + import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -14,19 +17,16 @@ import org.apache.spark.sql.types.StructType; import scala.Option; -import zingg.common.client.*; import zingg.common.client.Arguments; +import zingg.common.client.ArgumentsUtil; import zingg.common.client.ClientOptions; -import zingg.spark.connect.*; -import zingg.spark.connect.proto.*; +import zingg.common.client.FieldDefinition; +import zingg.common.client.IArguments; import zingg.common.client.pipe.Pipe; -import zingg.spark.client.SparkClient; import zingg.spark.client.pipe.SparkPipe; - -import java.util.Map; -import java.util.Optional; -import java.util.HashMap; -import java.util.List; +import zingg.spark.connect.proto.DataFormat; +import zingg.spark.connect.proto.MatchType; +import zingg.spark.connect.proto.SubmitZinggJob; public class ZinggConnectPlugin implements RelationPlugin { private SparkPipe parsePipe(zingg.spark.connect.proto.Pipe protoPipe) { @@ -85,6 +85,49 @@ private SparkPipe[] parsePipes(List protoPipes) return protoPipes.stream().map(protoPipe -> parsePipe(protoPipe)).toArray(SparkPipe[]::new); } + private FieldDefinition parseFieldDefinition(zingg.spark.connect.proto.FieldDefinition fieldDefinitionProto) { + FieldDefinition fieldDefinition = new FieldDefinition(); + fieldDefinition.setMatchType(fieldDefinitionProto.getMatchTypeList().stream().map(mt -> { + if (mt == MatchType.MT_FUZZY) { + return zingg.common.client.MatchType.FUZZY; + } else if (mt == MatchType.MT_EXACT) { + return zingg.common.client.MatchType.EXACT; + } else if (mt == MatchType.MT_DONT_USE) { + return zingg.common.client.MatchType.DONT_USE; + } else if (mt == MatchType.MT_EMAIL) { + return zingg.common.client.MatchType.EMAIL; + } else if (mt == MatchType.MT_PINCODE) { + return zingg.common.client.MatchType.PINCODE; + } else if (mt == MatchType.MT_NULL_OR_BLANK) { + return zingg.common.client.MatchType.NULL_OR_BLANK; + } else if (mt == MatchType.MT_TEXT) { + return zingg.common.client.MatchType.TEXT; + } else if (mt == MatchType.MT_NUMERIC) { + return zingg.common.client.MatchType.NUMERIC; + } else if (mt == MatchType.MT_NUMERIC_WITH_UNITS) { + return zingg.common.client.MatchType.NUMERIC_WITH_UNITS; + } else if (mt == MatchType.MT_ONLY_ALPHABETS_EXACT) { + return zingg.common.client.MatchType.ONLY_ALPHABETS_EXACT; + } else if (mt == MatchType.MT_ONLY_ALPHABETS_FUZZY) { + return zingg.common.client.MatchType.ONLY_ALPHABETS_FUZZY; + } else { + throw new RuntimeException(String.format("Unknown type %s", mt.name())); + } + }).toList()); + + fieldDefinition.setDataType(fieldDefinitionProto.getDataType()); + fieldDefinition.setFieldName(fieldDefinitionProto.getFieldName()); + fieldDefinition.setFields(fieldDefinitionProto.getFields()); + if (fieldDefinitionProto.hasStopWords()) { + fieldDefinition.setStopWords(fieldDefinitionProto.getStopWords()); + } + if (fieldDefinitionProto.hasAbbreviations()) { + 
fieldDefinition.setAbbreviations(fieldDefinitionProto.getAbbreviations()); + } + + return fieldDefinition; + } + // 3.5.2 behaviour // Because of shading rules this method may be marked as wrongly overriden @Override @@ -102,6 +145,9 @@ public Option transform(Any relation, SparkConnectPlanner planner) arguments.setData(parsePipes(zinggJobProto.getArgumnets().getDataList())); // Training samples arguments.setTrainingSamples(parsePipes(zinggJobProto.getArgumnets().getTrainingSamplesList())); + // Field definitions + arguments.setFieldDefinition(zinggJobProto.getArgumnets().getFiieldDefinitionList().stream() + .map(fd -> parseFieldDefinition(fd)).toList()); // Arguments arguments.setZinggDir(zinggJobProto.getArgumnets().getZinggDir()); @@ -120,44 +166,57 @@ public Option transform(Any relation, SparkConnectPlanner planner) // Options zingg.spark.connect.proto.ClientOptions clientOptionsProto = zinggJobProto.getCliOptions(); - } - } + ClientOptions clientOptions = new ClientOptions(); - public Optional transform(byte[] bytes, SparkConnectPlanner sparkConnectPlanner) { - Any command; - try { - command = Any.parseFrom(bytes); - if (!command.is(SubmitZinggJob.class)) { - return Optional.empty(); - } else { - try (SparkSession session = sparkConnectPlanner.sessionHolder().session()) { - SubmitZinggJob request = command.unpack(SubmitZinggJob.class); - String options = request.getOptions(); - String args = request.getArgs(); - ClientOptions clientOptions = new ClientOptions(options); - IArguments arguments = new ArgumentsUtil() - .createArgumentsFromJSONString(args, clientOptions.getOptionValue(ClientOptions.PHASE)); - SparkClient client = new SparkClient(arguments, clientOptions, session); - client.init(); - client.execute(); - client.postMetrics(); - - Dataset outDF = session.createDataFrame( - List.of( - RowFactory.create( - "SUCCESS", - new ArgumentsUtil().writeArgumentstoJSONString(client.getArguments()))), - new StructType(new StructField[] { - DataTypes.createStructField("status", DataTypes.StringType, false), - DataTypes.createStructField("newArgs", DataTypes.StringType, false) - })); - return Optional.of(outDF.logicalPlan()); - } - } - } catch (InvalidProtocolBufferException e) { - throw new RuntimeException("Protobuf exception in SparkConnect", e); - } catch (ZinggClientException e) { - throw new RuntimeException("Zingg Internal Error", e); + if (clientOptionsProto.hasPhase()) { + clientOptions.setOptionValue(ClientOptions.PHASE, clientOptionsProto.getPhase()); + } + if (clientOptionsProto.hasLicense()) { + clientOptions.setOptionValue(ClientOptions.LICENSE, clientOptionsProto.getLicense()); + } + if (clientOptionsProto.hasEmail()) { + clientOptions.setOptionValue(ClientOptions.EMAIL, clientOptionsProto.getEmail()); + } + if (clientOptionsProto.hasConf()) { + clientOptions.setOptionValue(ClientOptions.CONF, clientOptionsProto.getConf()); + } + if (clientOptionsProto.hasPreprocess()) { + clientOptions.setOptionValue(ClientOptions.PREPROCESS, clientOptionsProto.getPreprocess()); + } + if (clientOptionsProto.hasJobId()) { + clientOptions.setOptionValue(ClientOptions.JOBID, clientOptionsProto.getJobId()); + } + if (clientOptionsProto.hasFormat()) { + clientOptions.setOptionValue(ClientOptions.FORMAT, clientOptionsProto.getFormat()); + } + if (clientOptionsProto.hasZinggDir()) { + clientOptions.setOptionValue(ClientOptions.ZINGG_DIR, clientOptionsProto.getZinggDir()); + } + if (clientOptionsProto.hasModelId()) { + clientOptions.setOptionValue(ClientOptions.MODEL_ID, 
clientOptionsProto.getModelId()); + } + if (clientOptionsProto.hasCollectMetrics()) { + clientOptions.setOptionValue(ClientOptions.COLLECT_METRICS, clientOptionsProto.getCollectMetrics()); + } + if (clientOptionsProto.hasShowConcise()) { + clientOptions.setOptionValue(ClientOptions.SHOW_CONCISE, clientOptionsProto.getShowConcise()); + } + if (clientOptionsProto.hasLocation()) { + clientOptions.setOptionValue(ClientOptions.LOCATION, clientOptionsProto.getLocation()); + } + if (clientOptionsProto.hasColumn()) { + clientOptions.setOptionValue(ClientOptions.COLUMN, clientOptionsProto.getColumn()); + } + if (clientOptionsProto.hasRemote()) { + clientOptions.setOptionValue(ClientOptions.REMOTE, clientOptionsProto.getRemote()); + } + + Dataset outDF = spark.createDataFrame( + List.of(RowFactory.create(new ArgumentsUtil().writeArgumentstoJSONString(arguments), + String.join(" ", clientOptions.getCommandLineArgs()))), + new StructType(new StructField[] { DataTypes.createStructField("args", DataTypes.StringType, false), + DataTypes.createStructField("cliopts", DataTypes.StringType, false) })); + return Option.apply(outDF.logicalPlan()); } } } diff --git a/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java index d7eddb452..07d04e315 100644 --- a/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java +++ b/spark/client/src/main/java/zingg/spark/connect/proto/ConnectPlugins.java @@ -59,7 +59,7 @@ public static void registerAllExtensions( "nets\022/\n\013cli_options\030\002 \001(\0132\016.ClientOption" + "sR\ncliOptions\022)\n\016in_memory_date\030\003 \001(\014H\000R" + "\014inMemoryDate\210\001\001B\021\n\017_in_memory_date\"\200\002\n\017" + - "FieldDefinition\022)\n\nmatch_type\030\001 \001(\0162\n.Ma" + + "FieldDefinition\022)\n\nmatch_type\030\001 \003(\0162\n.Ma" + "tchTypeR\tmatchType\022\033\n\tdata_type\030\002 \001(\tR\010d" + "ataType\022\035\n\nfield_name\030\003 \001(\tR\tfieldName\022\026" + "\n\006fields\030\004 \001(\tR\006fields\022\"\n\nstop_words\030\005 \001" + @@ -84,35 +84,36 @@ public static void registerAllExtensions( "id\030\n \001(\005R\005jobId\022\'\n\017collect_metrics\030\013 \001(\010" + "R\016collectMetrics\022!\n\014show_concise\030\014 \001(\010R\013" + "showConcise\022*\n\021stop_words_cutoff\030\r \001(\002R\017" + - "stopWordsCutoff\022\035\n\nblock_size\030\016 \001(\005R\tblo" + + "stopWordsCutoff\022\035\n\nblock_size\030\016 \001(\003R\tblo" + "ckSize\022\033\n\006column\030\017 \001(\tH\000R\006column\210\001\001B\t\n\007_" + - "column\"\302\004\n\rClientOptions\022\024\n\005phase\030\001 \001(\tR" + - "\005phase\022\030\n\007license\030\002 \001(\tR\007license\022\024\n\005emai" + - "l\030\003 \001(\tR\005email\022\022\n\004conf\030\004 \001(\tR\004conf\022#\n\npr" + - "eprocess\030\005 \001(\tH\000R\npreprocess\210\001\001\022\032\n\006job_i" + - "d\030\006 \001(\tH\001R\005jobId\210\001\001\022\033\n\006format\030\007 \001(\tH\002R\006f" + - "ormat\210\001\001\022 \n\tzingg_dir\030\010 \001(\tH\003R\010zinggDir\210" + - "\001\001\022\036\n\010model_id\030\t \001(\tH\004R\007modelId\210\001\001\022,\n\017co" + - "llect_metrics\030\n \001(\tH\005R\016collectMetrics\210\001\001" + - "\022&\n\014show_concise\030\013 \001(\tH\006R\013showConcise\210\001\001" + - "\022\037\n\010location\030\014 \001(\tH\007R\010location\210\001\001\022\033\n\006col" + - "umn\030\r \001(\tH\010R\006column\210\001\001\022\033\n\006remote\030\016 \001(\tH\t" + - 
"R\006remote\210\001\001B\r\n\013_preprocessB\t\n\007_job_idB\t\n" + - "\007_formatB\014\n\n_zingg_dirB\013\n\t_model_idB\022\n\020_" + - "collect_metricsB\017\n\r_show_conciseB\013\n\t_loc" + - "ationB\t\n\007_columnB\t\n\007_remote*\336\001\n\tMatchTyp" + - "e\022\014\n\010MT_FUZZY\020\000\022\014\n\010MT_EXACT\020\001\022\017\n\013MT_DONT" + - "_USE\020\002\022\014\n\010MT_EMAIL\020\003\022\016\n\nMT_PINCODE\020\004\022\024\n\020" + - "MT_NULL_OR_BLANK\020\005\022\013\n\007MT_TEXT\020\006\022\016\n\nMT_NU" + - "MERIC\020\007\022\031\n\025MT_NUMERIC_WITH_UNITS\020\010\022\033\n\027MT" + - "_ONLY_ALPHABETS_EXACT\020\t\022\033\n\027MT_ONLY_ALPHA" + - "BETS_FUZZY\020\n*\314\001\n\nDataFormat\022\n\n\006DF_CSV\020\000\022" + - "\016\n\nDF_PARQUET\020\001\022\013\n\007DF_JSON\020\002\022\013\n\007DF_TEXT\020" + - "\003\022\n\n\006DF_XLS\020\004\022\013\n\007DF_AVRO\020\005\022\013\n\007DF_JDBC\020\006\022" + - "\020\n\014DF_CASSANDRA\020\007\022\020\n\014DF_SNOWFLAKE\020\010\022\016\n\nD" + - "F_ELASTIC\020\t\022\r\n\tDF_EXACOL\020\n\022\016\n\nDF_BIGQUEY" + - "\020\013\022\017\n\013DF_INMEMORY\020\014B\035\n\031zingg.spark.conne" + - "ct.protoP\001b\006proto3" + "column\"\377\004\n\rClientOptions\022\031\n\005phase\030\001 \001(\tH" + + "\000R\005phase\210\001\001\022\035\n\007license\030\002 \001(\tH\001R\007license\210" + + "\001\001\022\031\n\005email\030\003 \001(\tH\002R\005email\210\001\001\022\027\n\004conf\030\004 " + + "\001(\tH\003R\004conf\210\001\001\022#\n\npreprocess\030\005 \001(\tH\004R\npr" + + "eprocess\210\001\001\022\032\n\006job_id\030\006 \001(\tH\005R\005jobId\210\001\001\022" + + "\033\n\006format\030\007 \001(\tH\006R\006format\210\001\001\022 \n\tzingg_di" + + "r\030\010 \001(\tH\007R\010zinggDir\210\001\001\022\036\n\010model_id\030\t \001(\t" + + "H\010R\007modelId\210\001\001\022,\n\017collect_metrics\030\n \001(\tH" + + "\tR\016collectMetrics\210\001\001\022&\n\014show_concise\030\013 \001" + + "(\tH\nR\013showConcise\210\001\001\022\037\n\010location\030\014 \001(\tH\013" + + "R\010location\210\001\001\022\033\n\006column\030\r \001(\tH\014R\006column\210" + + "\001\001\022\033\n\006remote\030\016 \001(\tH\rR\006remote\210\001\001B\010\n\006_phas" + + "eB\n\n\010_licenseB\010\n\006_emailB\007\n\005_confB\r\n\013_pre" + + "processB\t\n\007_job_idB\t\n\007_formatB\014\n\n_zingg_" + + "dirB\013\n\t_model_idB\022\n\020_collect_metricsB\017\n\r" + + "_show_conciseB\013\n\t_locationB\t\n\007_columnB\t\n" + + "\007_remote*\336\001\n\tMatchType\022\014\n\010MT_FUZZY\020\000\022\014\n\010" + + "MT_EXACT\020\001\022\017\n\013MT_DONT_USE\020\002\022\014\n\010MT_EMAIL\020" + + "\003\022\016\n\nMT_PINCODE\020\004\022\024\n\020MT_NULL_OR_BLANK\020\005\022" + + "\013\n\007MT_TEXT\020\006\022\016\n\nMT_NUMERIC\020\007\022\031\n\025MT_NUMER" + + "IC_WITH_UNITS\020\010\022\033\n\027MT_ONLY_ALPHABETS_EXA" + + "CT\020\t\022\033\n\027MT_ONLY_ALPHABETS_FUZZY\020\n*\314\001\n\nDa" + + "taFormat\022\n\n\006DF_CSV\020\000\022\016\n\nDF_PARQUET\020\001\022\013\n\007" + + "DF_JSON\020\002\022\013\n\007DF_TEXT\020\003\022\n\n\006DF_XLS\020\004\022\013\n\007DF" + + "_AVRO\020\005\022\013\n\007DF_JDBC\020\006\022\020\n\014DF_CASSANDRA\020\007\022\020" + + "\n\014DF_SNOWFLAKE\020\010\022\016\n\nDF_ELASTIC\020\t\022\r\n\tDF_E" + + "XACOL\020\n\022\016\n\nDF_BIGQUEY\020\013\022\017\n\013DF_INMEMORY\020\014" + + "B\035\n\031zingg.spark.connect.protoP\001b\006proto3" }; descriptor = com.google.protobuf.Descriptors.FileDescriptor .internalBuildGeneratedFileFrom(descriptorData,