Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public List<SearchResult> find(Entity entity) {
}
String htmlPage = new String(resultBuff);
log.debug("parsing html page content...");
//System.out.println(htmlPage.toString());
return parseHTML(htmlPage);

} catch (MalformedURLException ex) {
Expand All @@ -152,13 +153,16 @@ public List<SearchResult> parseHTML(String html) {
String title = "";
String identifier = "";
//TODO: search for tag "a" first to limit elements to search by attribute value?
Elements names = hit.getElementsByAttributeValueMatching("href", "/dara/search/search_show?.*");
Elements names = hit.getElementsByAttributeValueMatching("href", "/dara/search/search_show?.*");
Elements dois = hit.getElementsByAttributeValueContaining("href", "https://doi.org");
// each entry has exactly one name and one doi element
//TODO: except for some datasets that are not registered but only referenced in dara!
// e.g. "OECD Employment Outlook" -> no doi listed here -> ignored
for (Element name : names) {
title = name.text().trim();
//title = name.text().trim(); // PROBLEM!!
if (!name.childNodes().get(0).toString().trim().equals("") && !name.childNodes().get(0).toString().trim().equals("English") && !name.childNodes().get(0).toString().trim().equals("German")){
title = name.childNodes().get(0).toString().trim();
}
}
for (Element doi : dois) {
identifier = doi.text().trim();
Expand All @@ -167,6 +171,7 @@ public List<SearchResult> parseHTML(String html) {
continue;
}
//create the search result
//System.out.print("Creating search result: title: " + title + "; identifier: " + identifier + "\r");
log.debug("Creating search result: title: " + title + "; identifier: " + identifier);
List<String> numericInfo = InformationExtractor.getNumericInfo(title);
SearchResult sr = new SearchResult();
Expand Down
104 changes: 55 additions & 49 deletions src/test/java/io/github/infolis/algorithm/FederatedSearcherTest.java
Original file line number Diff line number Diff line change
@@ -1,49 +1,55 @@
package io.github.infolis.algorithm;

import static org.junit.Assert.assertEquals;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.SearchResult;
import io.github.infolis.infolink.querying.DaraHTMLQueryService;
import io.github.infolis.infolink.querying.QueryService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;

/**
*
* @author kata
*
*/
public class FederatedSearcherTest extends InfolisBaseTest {

@Test
public void testDaraQueryServices() throws IOException {
Execution execution = new Execution();
Entity entity = new Entity();
entity.setName("Studierendensurvey");
List<String> numInfo = new ArrayList<>();
numInfo.add("2012/13");
entity.setNumericInfo(numInfo);
dataStoreClient.post(Entity.class, entity);
execution.setLinkedEntities(Arrays.asList(entity.getUri()));
QueryService queryService = new DaraHTMLQueryService();
dataStoreClient.post(QueryService.class, queryService);
execution.setQueryServices(Arrays.asList(queryService.getUri(), queryService.getUri()));
execution.setAlgorithm(FederatedSearcher.class);
execution.setSearchResultLinkerClass(BestMatchLinker.class);
Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
algo.run();

List<SearchResult> searchResults = dataStoreClient.get(SearchResult.class, execution.getSearchResults());
// since the query service is given twice, FederatedSearcher should find the same result twice
assertEquals(2, searchResults.size());
assertEquals("10.4232/1.5126", searchResults.get(0).getIdentifier());
assertEquals("10.4232/1.5126", searchResults.get(1).getIdentifier());
}

}
package io.github.infolis.algorithm;

import static org.junit.Assert.assertEquals;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.SearchResult;
import io.github.infolis.infolink.querying.DaraHTMLQueryService;
import io.github.infolis.infolink.querying.QueryService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;

/**
*
* @author kata
*
*/
public class FederatedSearcherTest extends InfolisBaseTest {

@Test
public void testDaraQueryServices() throws IOException {
Execution execution = new Execution();
Entity entity = new Entity();
entity.setName("Studierendensurvey");
List<String> numInfo = new ArrayList<>();
numInfo.add("2012/13");
entity.setNumericInfo(numInfo);
dataStoreClient.post(Entity.class, entity);
execution.setLinkedEntities(Arrays.asList(entity.getUri()));
QueryService queryService = new DaraHTMLQueryService();
dataStoreClient.post(QueryService.class, queryService);
execution.setQueryServices(Arrays.asList(queryService.getUri(), queryService.getUri()));
execution.setAlgorithm(FederatedSearcher.class);
execution.setSearchResultLinkerClass(BestMatchLinker.class);
Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
algo.run();

List<SearchResult> searchResults = dataStoreClient.get(SearchResult.class, execution.getSearchResults());
// since the query service is given twice, FederatedSearcher should find the same result twice
assertEquals(2, searchResults.size());

//Update from 10.4232/1.5126 to 10.4232/1.12967
//assertEquals("10.4232/1.5126", searchResults.get(0).getIdentifier());
//assertEquals("10.4232/1.5126", searchResults.get(1).getIdentifier());

assertEquals("10.4232/1.12967", searchResults.get(0).getIdentifier());
assertEquals("10.4232/1.12967", searchResults.get(1).getIdentifier());

}

}
18 changes: 10 additions & 8 deletions src/test/java/io/github/infolis/algorithm/ReferenceLinkerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ public void testExecute() throws IOException {
EntityLink link2 = dataStoreClient.get(EntityLink.class, linkUris.get(1));
Entity toEntity2 = dataStoreClient.get(Entity.class, link2.getToEntity());
Entity fromEntity2 = dataStoreClient.get(Entity.class, link2.getFromEntity());
assertEquals("Studiensituation und studentische Orientierungen 2012/13 (Studierenden-Survey)", toEntity2.getName());
assertEquals("10.4232/1.5126", toEntity2.getIdentifiers().get(0));
// Einzukommentieren! //assertEquals("Studiensituation und studentische Orientierungen 2012/13 (Studierenden-Survey)", toEntity2.getName());
//assertEquals("10.4232/1.5126", toEntity2.getIdentifiers().get(0));
assertEquals("10.4232/1.12967", toEntity2.getIdentifiers().get(0));

assertEquals("Studierendensurvey", toEntity1.getName());
assertEquals(Arrays.asList("2012/13"), toEntity1.getNumericInfo());
assertEquals(toEntity1.getUri(), fromEntity2.getUri());
Expand All @@ -74,10 +76,10 @@ public void testExecute() throws IOException {
toEntity2 = dataStoreClient.get(Entity.class, links.get(2).getToEntity());
Entity toEntity3 = dataStoreClient.get(Entity.class, links.get(3).getToEntity());
assertTrue(Arrays.asList(toEntity1.getName(), toEntity2.getName(), toEntity3.getName()).contains("Studiensituation und studentische Orientierungen 2012/13 (Studierenden-Survey)"));
assertTrue(Arrays.asList(toEntity1.getName(), toEntity2.getName(), toEntity3.getName()).contains("Studiensituation und studentische Orientierungen (Studierenden-Survey) Kumulation 1983 - 2013"));
assertTrue(Arrays.asList(toEntity1.getName(), toEntity2.getName(), toEntity3.getName()).contains("Studiensituation und studentische Orientierungen (Studierenden-Survey) Kumulation 1983 - 2016"));
assertTrue(Arrays.asList(toEntity1.getIdentifiers().get(0), toEntity2.getIdentifiers().get(0), toEntity3.getIdentifiers().get(0)).contains("10.4232/1.5126"));
assertTrue(Arrays.asList(toEntity1.getIdentifiers().get(0), toEntity2.getIdentifiers().get(0), toEntity3.getIdentifiers().get(0)).contains("10.4232/1.12510"));
assertTrue(Arrays.asList(toEntity1.getIdentifiers().get(0), toEntity2.getIdentifiers().get(0), toEntity3.getIdentifiers().get(0)).contains("10.4232/1.12494"));
assertTrue(Arrays.asList(toEntity1.getIdentifiers().get(0), toEntity2.getIdentifiers().get(0), toEntity3.getIdentifiers().get(0)).contains("10.4232/1.5126"));

Execution exec3 = new Execution();
TextualReference reference2 = new TextualReference("In this snippet, the reference", "Studierendensurvey", "of any year is to", infolisFile.getUri(), "pattern", infolisFile.getUri());
Expand Down Expand Up @@ -116,13 +118,13 @@ public void testExecute() throws IOException {
assertEquals(4, linkUris.size());
EntityLink link5 = dataStoreClient.get(EntityLink.class, linkUris.get(1));
Entity toEntity5 = dataStoreClient.get(Entity.class, link5.getToEntity());
assertEquals("Studiensituation und studentische Orientierungen 2012/13 (Studierenden-Survey)", toEntity5.getName());
assertEquals("10.4232/1.5126", toEntity5.getIdentifiers().get(0));
assertEquals("Studiensituation und studentische Orientierungen 2015/16 (Studierenden-Survey)", toEntity5.getName());
assertEquals("10.4232/1.12967", toEntity5.getIdentifiers().get(0));

EntityLink link5b = dataStoreClient.get(EntityLink.class, linkUris.get(1));
Entity toEntity5b = dataStoreClient.get(Entity.class, link5b.getToEntity());
assertEquals("Studiensituation und studentische Orientierungen 2012/13 (Studierenden-Survey)", toEntity5b.getName());
assertEquals("10.4232/1.5126", toEntity5b.getIdentifiers().get(0));
assertEquals("Studiensituation und studentische Orientierungen 2015/16 (Studierenden-Survey)", toEntity5b.getName());
assertEquals("10.4232/1.12967", toEntity5b.getIdentifiers().get(0));

// no matching entries in dara
Execution exec6 = new Execution();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,80 +1,83 @@
package io.github.infolis.algorithm;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.EntityLink;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.infolink.querying.DaraHTMLQueryService;
import io.github.infolis.infolink.querying.QueryService;

/**
*
* @author kata
*
*/
public class SearchDoisAndCreateLinksTest extends InfolisBaseTest {

private final String[] testString = {
"Version 1.0.0, 21.03.2013 erste Archiv-Version doi:10.4232/1.11692. Änderungen in dieser Version. 2013-11-21, Fehler in Antwortskala für V749 OVERALL ..."
};
private List<String> uris = new ArrayList<>();

public SearchDoisAndCreateLinksTest() throws Exception {
List<InfolisFile> inputFiles = createTestTextFiles(1, testString);
for (InfolisFile file : inputFiles) {
uris.add(file.getUri());
}
}


private static final Logger log = LoggerFactory.getLogger(SearchDoisAndCreateLinksTest.class);

@Test
public void testSearchDoisAndCreateLinks() throws IOException {
List<String> qServices = postQueryServices();
Execution e = new Execution();
e.setAlgorithm(SearchDoisAndCreateLinks.class);
e.setInputFiles(uris);
e.setQueryServices(qServices);
dataStoreClient.post(Execution.class, e);
e.instantiateAlgorithm(dataStoreClient, fileResolver).run();
for (String ref : e.getTextualReferences()) {
log.debug("created textual reference: " + ref);
}

List<EntityLink> createdLinks = dataStoreClient.get(EntityLink.class, e.getLinks());
assertEquals(2, createdLinks.size());

EntityLink el = createdLinks.get(0);
log.debug("created link from " + el.getFromEntity() + " to " + el.getToEntity());
TextualReference textRef = dataStoreClient.get(TextualReference.class, el.getLinkReason());
assertEquals("infolisFile_1", textRef.getTextFile());

el = createdLinks.get(1);
Entity targetEntity = dataStoreClient.get(Entity.class, el.getToEntity());
assertEquals("German General Social Survey - ALLBUS 2010", targetEntity.getName());
assertEquals("10.4232/1.11692", (targetEntity.getIdentifiers().get(0)));
}

public List<String> postQueryServices() throws IOException {
List<String> postedQueryServices = new ArrayList<>();
QueryService p1 = new DaraHTMLQueryService();
p1.setMaxNumber(10);
dataStoreClient.post(QueryService.class, p1);
postedQueryServices.add(p1.getUri());
return postedQueryServices;
}

}
package io.github.infolis.algorithm;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.EntityLink;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.infolink.querying.DaraHTMLQueryService;
import io.github.infolis.infolink.querying.QueryService;

/**
*
* @author kata
*
*/
public class SearchDoisAndCreateLinksTest extends InfolisBaseTest {

private final String[] testString = {
//"Version 1.0.0, 21.03.2013 erste Archiv-Version doi:10.4232/1.11692. Änderungen in dieser Version. 2013-11-21, Fehler in Antwortskala für V749 OVERALL ..."
"Version 1.0.1 (aktuelle Version) 21.11.2013 Antwortskala für V749 OVERALL LIFE SATISFACTION korrigiert doi:10.4232/1.11782"
};
private List<String> uris = new ArrayList<>();

public SearchDoisAndCreateLinksTest() throws Exception {
List<InfolisFile> inputFiles = createTestTextFiles(1, testString);
for (InfolisFile file : inputFiles) {
uris.add(file.getUri());
}
}


private static final Logger log = LoggerFactory.getLogger(SearchDoisAndCreateLinksTest.class);

@Test
public void testSearchDoisAndCreateLinks() throws IOException {
List<String> qServices = postQueryServices();
Execution e = new Execution();
e.setAlgorithm(SearchDoisAndCreateLinks.class);
e.setInputFiles(uris);
e.setQueryServices(qServices);
dataStoreClient.post(Execution.class, e);
e.instantiateAlgorithm(dataStoreClient, fileResolver).run();
for (String ref : e.getTextualReferences()) {
log.debug("created textual reference: " + ref);
}

List<EntityLink> createdLinks = dataStoreClient.get(EntityLink.class, e.getLinks());
assertEquals(2, createdLinks.size());

EntityLink el = createdLinks.get(0);
log.debug("created link from " + el.getFromEntity() + " to " + el.getToEntity());
TextualReference textRef = dataStoreClient.get(TextualReference.class, el.getLinkReason());
assertEquals("infolisFile_1", textRef.getTextFile());

el = createdLinks.get(1);
Entity targetEntity = dataStoreClient.get(Entity.class, el.getToEntity());
assertEquals("German General Social Survey - ALLBUS 2010", targetEntity.getName());
//assertEquals("10.4232/1.11692", (targetEntity.getIdentifiers().get(0)));
assertEquals("10.4232/1.11782", (targetEntity.getIdentifiers().get(0)));

}

public List<String> postQueryServices() throws IOException {
List<String> postedQueryServices = new ArrayList<>();
QueryService p1 = new DaraHTMLQueryService();
p1.setMaxNumber(10);
dataStoreClient.post(QueryService.class, p1);
postedQueryServices.add(p1.getUri());
return postedQueryServices;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,7 @@ public void testMetaExtraction() throws Exception {
Entity e = dataStoreClient.get(Entity.class, inFile.getManifestsEntity());
assertEquals("The possible trinity: Optimal interest rate, exchange rate, and taxes on capital flows in a DSGE model for a small open economy", e.getName());
assertEquals(1, e.getAuthors().size());
assertEquals("A traditional way of thinking about the exchange rate (XR) regime and capital account openness has been framed in "
+ "terms of the 'impossible trinity' or 'trilemma', in which policymakers can only have 2 of 3 possible outcomes: open capital markets, monetary "
+ "independence and pegged XRs. This paper is an extension of Escude (A DSGE Model for a SOE with Systematic Interest and Foreign Exchange Policies "
+ "in Which Policymakers Exploit the Risk Premium for Stabilization Purposes, 2013), which focuse (...)",//d on interest rate and XR policies, since it introduces"
// + " the third vertex of the 'trinity' in the form of taxes on private foreign debt. These affect the risk-adjusted uncovered interest parity equation and"
//+ " hence influence the SOE's international financial flows. A useful way to illustrate the range of policy alternatives is to associate them with the"
//+ " faces of a triangle. Each of 3 possible government intervention policies taken individually (in the domestic currency bond market, in the FX market,"
//+ " and in the foreign currency bonds market) corresponds to one of the vertices of the triangle, each of the 3 possible pairs of intervention policies"
//+ " corresponds to one of its 3 edges, and the 3 simultaneous intervention policies taken jointly correspond to its interior. This paper shows that this "
//+ "interior, or 'possible trinity' is quite generally not only possible but optimal, since the CB obtains a lower loss when it implements a policy with"
//+ " all three interventions.",
e.getAbstractText());
assertEquals("A traditional way of thinking about the exchange rate (XR) regime and capital account openness has been framed in terms of the 'impossible trinity' or 'trilemma', in which policymakers can only have 2 of 3 possible outcomes: open capital markets, monetary independence and pegged XRs. This paper is an extension of Escude (A DSGE Model for a SOE with Systematic Interest and Foreign Exchange Policies in Which Policymakers Exploit the Risk Premium for Stabilization Purposes, 2013), which focuse (...)",e.getAbstractText().toString());
assertEquals(9, e.getSubjects().size());
log.debug("ids: " + e.getIdentifiers());
assertEquals(Arrays.asList(
Expand Down
Loading