diff --git a/.gitignore b/.gitignore
index dfcef86c..8c4ab2c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,9 @@ run/
nb-configuration.xml
nbactions.xml
+.classpath
+.factorypath
+.project
+.settings
+.vscode
+.DS_Store
diff --git a/.travis.yml b/.travis.yml
index 9faa181d..2e9b4f53 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
sudo: false
language: java
jdk:
-- openjdk8
+- openjdk11
diff --git a/README.md b/README.md
index 42c7ba51..c49ca89a 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ OAI-PMH **endpoint**.
# Building
-Building this app requires JDK 8 and Apache Maven. It can be built
+Building this app requires JDK 11 and Apache Maven. It can be built
simply using the command:
-```mvn clean package assembly:assembly```
+```mvn clean install```
If you use a Java IDE, it is highly likely it also offers a simple way
to do the above.
@@ -37,7 +37,7 @@ You can also use the `build.sh` script to run a build within an environment
provisioned with suitable versions of the JDK and Maven. Requires docker.
The above build process creates a package named
-`oai-harvest-manager-x.y.z.tar.gz` (where x.y.z is a version number).
+`target/oai-harvest-manager-x.y.z.tar.gz` (where x.y.z is a version number).
# Running the Application
@@ -60,6 +60,8 @@ override the timeout value defined in `config.xml`, if any. The first
parameter that does not contain = is taken as the configuration file
name.
+If you used `build.sh` to run a build, you can use `run.sh config.xml` to run this build.
+
# Configuration
@@ -77,8 +79,9 @@ file. The configuration file is composed of four sections:
listed.
To get a clear idea of the structure of the configuration file, see
-the [sample configuration files](src/main/resources) in juxtaposition
-with the explanation for each section below.
+the [sample configuration files](src/main/resources) or the
+[CLARIN configuration files](https://github.com/clarin-eric/oai-harvest-config) in
+juxtaposition with the explanation for each section below.
## Configuring Settings
@@ -142,7 +145,11 @@ action types are available:
- The *transform* action applies a mapping, defined in an XSLT file,
to the metadata record. This can be used, among other things, for
semantic mapping between metadata schemata. See the included
- configuration files for an example.
+ configuration files for an example. The XSLT receives various parameters:
+ 1. ```config``` the configuration file used
+ 2. ```provider_name``` the provider name
+ 3. ```provider_uri``` the endpoint
+ 4. ```record_identifier``` the id of the record to transform
For each provider, the first format definition that the provider
supports will determine the action sequence to be executed. If one of
@@ -174,6 +181,10 @@ For each provider, the following can be defined:
delay and timeout) can be overwritten for a specific provider by
adding them as attributes to the provider element.
+- The attribute *exclusive*, when set to true, indicates that the
+ provider should be harvested on its own, i.e. no other harvesting threads
+ should be active. This can be used when a provider has some huge records.
+
- The provider element may contain multiple *set* child elements,
which specify the names of OAI-PMH sets to be harvested.
@@ -182,8 +193,10 @@ a *centre registry*. So far, this registry is only used by the CLARIN community.
The registry is specified by its URL. All the provider endpoints defined in the
registry will be harvested. Sometimes, it might be necessary to exclude an
endpoint from the ones defined in the registry. This can be done by specifying
-its URL in the configuration file used for harvesting. Please review the
-instructions in the configuration files supplied in the package.
+its URL in the configuration file used for harvesting. In other cases
+an endpoint loaded from the registry needs its own specific configuration, e.g. a timeout;
+this can be done in a similar vein to excluding. Please review the
+instructions in the configuration files supplied in the package.
# Static Providers
@@ -222,10 +235,6 @@ convenient for debugging specific providers.
# Implementation Notes
-Saxon is used as the XPath engine, although only standard APIs are
-used and hence changing to a different XPath processor would be
-trivial.
-
Processing for each provider runs in a separate thread. It is not
possible to target a single provider with multiple threads (except in
the special case where sets are used; then it is possible to mention
@@ -259,9 +268,4 @@ action actionSequences, and 5 each for the directories ```cmdi``` and
The pooling implementation is particularly important when
transformations are used, as preparing a transformation object
-involves parsing the XSLT, potentially a time-consuming process.
-
-
-# Build Status
-
-[![Build Status](https://travis-ci.org/TheLanguageArchive/oai-harvest-manager.png?branch=master)](https://travis-ci.org/TheLanguageArchive/oai-harvest-manager)
+involves parsing the XSLT, potentially a time-consuming process.
\ No newline at end of file
diff --git a/assembly.xml b/assembly.xml
index 72f0b322..a7b47641 100644
--- a/assembly.xml
+++ b/assembly.xml
@@ -12,11 +12,9 @@
- config*.xml
- oai2.xsl
+ config*.xml
addOAISetName.xsl
- olac2cmdi.xsl
- sil_to_iso6393.xml
+ filter.xsl
-
+
+
+
+ CLARIN
+ CLARIN Repository
+ https://nexus.clarin.eu/content/repositories/Clarin
+
+ false
+
+
+
+ CLARIN-Snapshot
+ CLARIN Snapshot Repository
+ https://nexus.clarin.eu/content/repositories/clarin-snapshot
+
+ true
+
+
+
-
- ibiblio
- ibiblio.org
- http://mirrors.ibiblio.org/pub/mirrors/maven2
-
-
+
+
-
-
-
- org.apache.logging.log4j
- log4j-slf4j-impl
- 2.13.1
-
+
+ org.glassfish.jaxb
+ codemodel
+ 3.0.0-M4
+
+
+ org.glassfish.jaxb
+ txw2
+ 3.0.0-M4
+
+
+ com.sun.xsom
+ xsom
+ 20140925
+
+
+ com.sun.istack
+ 4.0.0-M3
+ istack-commons-runtime
+
+
+ com.sun.xml.bind.external
+ rngom
+ 3.0.0-M4
+
-
- net.sf.saxon
- Saxon-HE
- 9.5.1-8
-
-
- xalan
- xalan
- 2.7.2
-
+
+ org.apache.logging.log4j
+ log4j-slf4j-impl
+ 2.13.3
+
+
+ org.slf4j
+ log4j-over-slf4j
+ 1.7.30
+
-
- junit
- junit
- 4.13
- test
-
+
+ net.sf.saxon
+ Saxon-HE
+ 9.5.1-8
+
-
- joda-time
- joda-time
- 2.2
-
+
+ nl.mpi.tla
+ SaxonUtils
+ 1.0-SNAPSHOT
+ jar
+
-
- commons-io
- commons-io
- 2.5
-
+
+ xalan
+ xalan
+ 2.7.2
+
-
- org.mockito
- mockito-all
- 1.9.5
-
+
+ junit
+ junit
+ 4.13.1
+ test
+
-
- org.codehaus.woodstox
- woodstox-core-asl
- 4.2.0
- jar
-
+
+ joda-time
+ joda-time
+ 2.10.6
+
+
+
+ commons-io
+ commons-io
+ 2.7
+
+
+
+ org.mockito
+ mockito-core
+ 3.4.6
+
+
+
+
+ org.codehaus.woodstox
+ woodstox-core-asl
+ 4.4.1
+ jar
+
-
- com.github.tomakehurst
- wiremock-jre8
- 2.24.1
- test
-
-
- com.google.guava
- guava
- 28.2-jre
-
-
+
+ com.jayway.jsonpath
+ json-path
+ 2.4.0
+
+
+
+ com.github.tomakehurst
+ wiremock-jre8
+ 2.27.1
+ test
+
+
+ com.google.guava
+ guava
+ 29.0-jre
+
+
+
+
+ javax.annotation
+ javax.annotation-api
+ 1.3.2
+
+
+ javax.xml.bind
+ jaxb-api
+ 2.4.0-b180725.0427
+
+
+ org.glassfish.jaxb
+ jaxb-runtime
+ 2.4.0-b180725.0644
+
+
+
+
+
+
+ ${project.artifactId}-${versionNumber}
+
+
+
+ src/test/resources
+
+ **
+
+
+
-
-
- ${project.artifactId}-${versionNumber}
-
-
-
- src/test/resources
-
- **
-
-
-
+
+
+
+
+ maven-deploy-plugin
+ 3.0.0-M1
+
+ true
+
+
+
+
-
-
-
-
- maven-deploy-plugin
-
- true
-
-
-
-
+
-
+
+ org.apache.maven.plugins
+ maven-enforcer-plugin
+ 3.0.0-M3
+
+
+ enforce-maven
+
+ enforce
+
+
+
+
+ 3.0.5
+
+
+
+
+
+
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.1
-
-
- 1.8
-
-
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+ 11
+
+
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.10.3
-
- src/main/java;target/generated-sources/jaxb
-
-
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.2.0
+
+ src/main/java;target/generated-sources/jaxb
+
+
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
-
-
- copy-dependencies
- package
-
- copy-dependencies
-
-
- target/lib
-
- true
- true
-
-
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy-dependencies
+ package
+
+ copy-dependencies
+
+
+ target/lib
+ true
+ true
+
+
+
+
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 2.5
-
- target/classes
-
-
- lib/
- true
- nl.mpi.oai.harvester.control.Main
-
-
- ${user.name}
- ${maven.build.timestamp}
- ${project.version}
- ${buildNumber}
- ${project.artifactId}-${versionNumber}.jar
- .
-
-
-
- **/log4j.properties
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.2.0
+
+ target/classes
+
+
+ lib/
+ true
+ false
+ nl.mpi.oai.harvester.control.Main
+
+
+ ${user.name}
+ ${maven.build.timestamp}
+ ${project.version}
+ ${buildNumber}
+ ${project.artifactId}-${versionNumber}.jar
+ .
+
+
+
+ **/log4j.properties
+
+
+
-
-
- org.apache.maven.plugins
- maven-assembly-plugin
- 2.4
-
- ${project.artifactId}-${versionNumber}
- false
-
- assembly.xml
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+ 3.3.0
+
+ ${project.artifactId}-${versionNumber}
+ false
+
+ assembly.xml
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
-
-
- org.codehaus.mojo
- buildnumber-maven-plugin
- 1.2
-
-
- validate
-
- create
-
-
- none
- 6
-
-
-
-
- false
- false
-
-
+
+
+ org.codehaus.mojo
+ buildnumber-maven-plugin
+ 1.4
+
+
+ validate
+
+ create
+
+
+ none
+ 6
+
+
+
+
+ false
+ false
+
+
-
-
- org.codehaus.mojo
- jaxb2-maven-plugin
-
- 2.4
-
-
- xjc
-
- xjc
-
-
-
-
-
- nl.mpi.oai.harvester.generated
-
-
-
+
+
+ org.codehaus.mojo
+ jaxb2-maven-plugin
+
+ 2.5.0
+
+
+ xjc
+
+ xjc
+
+
+
+
+ nl.mpi.oai.harvester.generated
+
+
-
-
- org.codehaus.mojo
- findbugs-maven-plugin
- 3.0.0
-
- Normal
- Min
-
-
-
-
+
+
+ org.codehaus.mojo
+ findbugs-maven-plugin
+ 3.0.5
+
+ Normal
+ Min
+
+
+
+
-
-
-
+
+
+
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.10.3
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.2.0
+
+
+
diff --git a/run.sh b/run.sh
index e6e5aa12..7f177dfb 100755
--- a/run.sh
+++ b/run.sh
@@ -1,8 +1,8 @@
#!/bin/bash
JAVA_TARGET_DIR="$(cd "$(dirname $0)" && pwd)/target"
-JAVA_IMAGE=registry.gitlab.com/clarin-eric/docker-alpine-supervisor-java-base:openjdk8-1.2.5
+JAVA_IMAGE=registry.gitlab.com/clarin-eric/docker-alpine-supervisor-java-base:openjdk11-1.2.12
CONTAINER_CONF_FILE_PATH='/tmp/harvester.conf'
-JAVA_CMD="java -Dlogdir=/logdir -jar /java-bin/oai-harvest-manager*.jar ${CONTAINER_CONF_FILE_PATH}"
+JAVA_CMD="java -Dlogdir=/logdir -jar /java-bin/oai-harvest-manager*.jar workdir=/workdir ${CONTAINER_CONF_FILE_PATH}"
WORKDIR="${WORKDIR:-$(pwd)/run/workdir}"
LOGDIR="${LOGDIR:-$(pwd)/run/log}"
CONFIG_FILE="$1"
diff --git a/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java b/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java
index 09b1f9ca..68587b34 100644
--- a/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java
+++ b/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java
@@ -326,7 +326,7 @@ public void harvest(String requestURL, int timeout, Path temp) throws MalformedU
retrySeconds = retryDate - now;
}
if (retrySeconds == 0) { // Apparently, it's a bad URL
- throw new FileNotFoundException("Bad URL?");
+ throw new FileNotFoundException("Bad URL["+requestURL+"]?");
}
logger.debug("Retry-After=" + retrySeconds);
if (retrySeconds > 0) {
diff --git a/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java b/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java
index 70e8652f..f1bf7938 100644
--- a/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java
+++ b/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java
@@ -18,6 +18,7 @@
package ORG.oclc.oai.harvester2.verb;
+import com.ctc.wstx.exc.WstxUnexpectedCharException;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
@@ -150,16 +151,22 @@ public String getResumptionToken()
}
break;
}
+
+ outer:
if (xmlr.hasNext())
- xmlr.next();
+ try {
+ xmlr.next();
+ } catch (WstxUnexpectedCharException ex) {
+ logger.info(String.format("Invalid char found in XML, skipping the current one and look for next one: {%s}", xmlr.toString()));
+ }
else
state = state == 1? 0: -1;// if START then STOP else ERROR
}
if (state < 0 || token == null) {
- logger.warn("couldn't find token in the XML stream!");
+ logger.debug("couldn't find token, done!");
return null;
}
- logger.debug("found token["+token+"] in the XML stream!");
+ logger.debug("found token["+token+"], resume!");
return token;
} else if (schemaLocation.indexOf(SCHEMA_LOCATION_V1_1_LIST_RECORDS) != -1) {
return getSingleString("/oai11_ListRecords:ListRecords/oai11_ListRecords:resumptionToken");
diff --git a/src/main/java/nl/mpi/oai/harvester/Provider.java b/src/main/java/nl/mpi/oai/harvester/Provider.java
index 5aed22be..87b649c7 100644
--- a/src/main/java/nl/mpi/oai/harvester/Provider.java
+++ b/src/main/java/nl/mpi/oai/harvester/Provider.java
@@ -196,7 +196,7 @@ public void setDeletionMode(DeletionMode deletionMode) {
this.deletionMode = deletionMode;
}
- /**
+ /**
* Set the name of this provider
*
* @param name name of provider
@@ -223,6 +223,10 @@ public String getOaiUrl() {
return oaiUrl;
}
+ public boolean hasSets() { // true only when at least one set is configured for this provider
+ return (sets != null && sets.length > 0); // was 0<=length, which is always true: an empty array must count as "no sets"
+ }
+
public String[] getSets() {
return sets;
}
@@ -415,7 +419,7 @@ public Metadata getRecord(String id, String mdPrefix) {
int retryDelay = getRetryDelay(i);
if (retryDelay > 0) {
try {
- Thread.sleep(retryDelay);
+ Thread.sleep(retryDelay*1000);
} catch(InterruptedException e) {
logger.error(e.getMessage(), e);
}
diff --git a/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java b/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java
index dc70a3d8..2eb0017e 100644
--- a/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java
+++ b/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java
@@ -18,23 +18,31 @@
package nl.mpi.oai.harvester.action;
-import net.sf.saxon.Configuration;
-import net.sf.saxon.TransformerFactoryImpl;
+import net.sf.saxon.s9api.DOMDestination;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XdmNode;
+import net.sf.saxon.s9api.XsltTransformer;
import nl.mpi.oai.harvester.metadata.Metadata;
+import nl.mpi.tla.util.Saxon;
+import nl.mpi.tla.util.SaxonListener;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.Document;
+import org.w3c.dom.Node;
-import javax.xml.transform.*;
-import javax.xml.transform.dom.DOMResult;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Source;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.URIResolver;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXSource;
-import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
@@ -43,7 +51,10 @@
import java.nio.file.Path;
import java.util.List;
import java.util.concurrent.Semaphore;
-import org.w3c.dom.Node;
+import javax.xml.transform.ErrorListener;
+import javax.xml.transform.SourceLocator;
+import net.sf.saxon.s9api.MessageListener;
+import net.sf.saxon.s9api.XsltExecutable;
/**
* This class represents the application of an XSL transformation to the
@@ -53,6 +64,9 @@
*/
public class TransformAction implements Action {
private static final Logger logger = LogManager.getLogger(TransformAction.class);
+
+ /** The XSL executable. */
+ private final XsltExecutable executable;
/** The file containing the XSL transformation. */
private String xsltFile;
@@ -60,12 +74,6 @@ public class TransformAction implements Action {
/** The directory containing cached resources. */
private Path cacheDir;
- /** Transformer factory */
- TransformerFactory factory;
-
- /** Prepared XSL transformation object. */
- private Templates templates;
-
/** A standard semaphore is used to track the number of running transforms. */
private Semaphore semaphore;
@@ -80,8 +88,11 @@ public class TransformAction implements Action {
* @param maxJobs the maximum number of concurrent transforms
* @throws FileNotFoundException stylesheet couldn't be found
* @throws TransformerConfigurationException there is a problem with the stylesheet
+ * @throws java.net.MalformedURLException
+ * @throws net.sf.saxon.s9api.SaxonApiException
*/
- public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs) throws FileNotFoundException, TransformerConfigurationException {
+ public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs)
+ throws FileNotFoundException, TransformerConfigurationException, MalformedURLException, SaxonApiException {
this(conf, xsltFile,cacheDir,(maxJobs>0?new Semaphore(maxJobs):null));
}
@@ -93,30 +104,23 @@ public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs) thr
* @param semaphore a semaphore to control the concurrent number of transforms
* @throws FileNotFoundException stylesheet couldn't be found
* @throws TransformerConfigurationException there is a problem with the stylesheet
+ * @throws java.net.MalformedURLException
+ * @throws net.sf.saxon.s9api.SaxonApiException
*/
- public TransformAction(Node conf, String xsltFile,Path cacheDir,Semaphore semaphore) throws FileNotFoundException, TransformerConfigurationException {
+ public TransformAction(Node conf, String xsltFile,Path cacheDir,Semaphore semaphore)
+ throws FileNotFoundException, TransformerConfigurationException, MalformedURLException, SaxonApiException {
this.config = conf;
- this.xsltFile = xsltFile;
+ this.xsltFile = xsltFile;
this.cacheDir = cacheDir;
this.semaphore = semaphore;
- factory = TransformerFactory.newInstance("net.sf.saxon.TransformerFactoryImpl", null);
- if(factory instanceof TransformerFactoryImpl) {
- TransformerFactoryImpl transformerFactoryImpl = ((TransformerFactoryImpl)factory);
- logger.debug("Telling Saxon to send messages as warnings to logger");
- final Configuration tfConfig = transformerFactoryImpl.getConfiguration();
- tfConfig.setMessageEmitterClass("net.sf.saxon.serialize.MessageWarner");
- if (cacheDir != null) {
- logger.debug("Setting the URLResolve to cache in "+cacheDir);
- transformerFactoryImpl.setURIResolver(new TransformActionURLResolver(transformerFactoryImpl.getURIResolver()));
- }
- }
- factory.setErrorListener(new TransformActionErrorListener());
Source xslSource = null;
if (xsltFile.startsWith("http:") || xsltFile.startsWith("https:"))
xslSource = new StreamSource(xsltFile);
else
xslSource = new StreamSource(new FileInputStream(xsltFile),xsltFile);
- templates = factory.newTemplates(xslSource);
+
+ executable = Saxon.buildTransformer(Saxon.buildDocument(xslSource));
+
}
@Override
@@ -133,30 +137,38 @@ public boolean perform(List records) {
} catch (InterruptedException e) { }
}
}
- Transformer transformer = templates.newTransformer();
Source source = null;
- Result output = null;
+ Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
+ DOMDestination output = new DOMDestination(doc);
if (record.hasStream()) {
source = new SAXSource(record.getSource());
- output = new StreamResult(new ByteArrayOutputStream());
} else {
source = new DOMSource(record.getDoc());
- output = new DOMResult();
}
- transformer.setParameter("config", this.config.getOwnerDocument());
- transformer.setParameter("provider_name",record.getOrigin().getName());
- transformer.setParameter("provider_uri",record.getOrigin().getOaiUrl());
- transformer.setParameter("record_identifier",record.getId());
- transformer.transform(source, output);
- if (record.hasStream()) {
- byte[] bytes = ((ByteArrayOutputStream)((StreamResult)output).getOutputStream()).toByteArray();
- record.setStream(new ByteArrayInputStream(bytes));
- logger.debug("transformed to XML stream with ["+bytes.length+"] bytes");
- } else {
- record.setDoc((Document) ((DOMResult)output).getNode());
- logger.debug("transformed to XML doc with ["+XPathFactory.newInstance().newXPath().evaluate("count(//*)", record.getDoc())+"] nodes");
+ XdmNode old = Saxon.buildDocument(source);
+ XsltTransformer transformer = executable.load();
+
+ TransformActionListener listener = new TransformActionListener();
+ transformer.setErrorListener(listener);
+ transformer.setMessageListener(listener);
+
+ if (cacheDir != null) {
+ logger.debug("Setting the URLResolve to cache in "+cacheDir);
+ transformer.setURIResolver(new TransformActionURLResolver(transformer.getURIResolver()));
}
- } catch (TransformerException | XPathExpressionException ex) {
+
+ transformer.setSource(old.asSource());
+ transformer.setDestination(output);
+
+ transformer.setParameter(new QName("config"), Saxon.wrapNode(this.config.getOwnerDocument()));
+ transformer.setParameter(new QName("provider_name"), new XdmAtomicValue(record.getOrigin().getName()));
+ transformer.setParameter(new QName("provider_uri"), new XdmAtomicValue(record.getOrigin().getOaiUrl()));
+ transformer.setParameter(new QName("record_identifier"), new XdmAtomicValue(record.getId()));
+
+ transformer.transform();
+ record.setDoc(doc);
+ logger.debug("transformed to XML doc with ["+XPathFactory.newInstance().newXPath().evaluate("count(//*)", record.getDoc())+"] nodes");
+ } catch (XPathExpressionException | SaxonApiException | ParserConfigurationException ex) {
logger.error("Transformation error: ",ex);
return false;
} finally {
@@ -171,57 +183,32 @@ public boolean perform(List records) {
@Override
public String toString() {
- return "transform using " + xsltFile;
+ return "transform using " + xsltFile;
}
// Transform actions differ if and only if the XSLT files differ.
@Override
public int hashCode() {
- return xsltFile.hashCode();
+ return xsltFile.hashCode();
}
@Override
public boolean equals(Object o) {
- if (o instanceof TransformAction) {
- TransformAction t = (TransformAction)o;
- return xsltFile.equals(t.xsltFile);
- }
- return false;
+ if (o instanceof TransformAction) {
+ TransformAction t = (TransformAction)o;
+ return xsltFile.equals(t.xsltFile);
+ }
+ return false;
}
@Override
public Action clone() {
- try {
- // This is a deep copy. The new object has its own Transform object.
- return new TransformAction(config, xsltFile,cacheDir,semaphore);
- } catch (FileNotFoundException | TransformerConfigurationException ex) {
- logger.error(ex);
- }
- return null;
- }
-
- class TransformActionErrorListener implements ErrorListener {
-
- public TransformActionErrorListener() {
- logger.debug("Redirecting XSLT warnings and errors to this logger");
- }
-
- @Override
- public void warning(TransformerException te) throws TransformerException {
- logger.warn("Transformer warning: "+te.getMessageAndLocation());
- //logger.debug("Transformation warning stacktrace", te);
- }
-
- @Override
- public void error(TransformerException te) throws TransformerException {
- // errors will be caught by the service, so swallow here except in debug
- logger.debug("Transformer error", te);
- }
-
- @Override
- public void fatalError(TransformerException te) throws TransformerException {
- // errors will be caught by the service, so swallow here except in debug
- logger.debug("Transformer fatal error", te);
- }
+ try {
+ // This is a deep copy. The new object has its own Transform object.
+ return new TransformAction(config, xsltFile,cacheDir,semaphore);
+ } catch (FileNotFoundException | TransformerConfigurationException | MalformedURLException | SaxonApiException ex) {
+ logger.error(ex);
+ }
+ return null;
}
class TransformActionURLResolver implements URIResolver {
@@ -251,13 +238,69 @@ public Source resolve(String href, String base) throws TransformerException {
logger.debug("Transformer resolver: loaded "+cacheFile+" from cache");
} else {
res = resolver.resolve(href, base);
- Result result = new StreamResult(cacheDir.resolve(cacheFile).toFile());
- Transformer xformer = factory.newTransformer();
- xformer.transform(res, result);
- logger.debug("Transformer resolver: stored "+cacheFile+" in cache");
+ try {
+ Saxon.save(res, cacheDir.resolve(cacheFile).toFile());
+ logger.debug("Transformer resolver: stored "+cacheFile+" in cache");
+ } catch (SaxonApiException ex) {
+ throw new TransformerException(ex);
+ }
}
return res;
}
}
+ class TransformActionListener implements MessageListener, ErrorListener {
+
+ protected boolean handleMessage(String msg, String loc, Exception e) {
+ if (msg.startsWith("INF: "))
+ logger.info(msg.replace("INF: ", ""));
+ else if (msg.startsWith("WRN: "))
+ logger.warn("["+loc+"]: "+msg.replace("WRN: ", ""), e);
+ else if (msg.startsWith("ERR: "))
+ logger.error("["+loc+"]: "+msg.replace("ERR: ", ""), e);
+ else if (msg.startsWith("DBG: "))
+ logger.debug("["+loc+"]: "+msg.replace("DBG: ", ""), e);
+ else
+ return false;
+ return true;
+ }
+
+ protected boolean handleException(TransformerException te) {
+ return handleMessage(te.getMessage(), te.getLocationAsString(), te);
+ }
+
+ @Override
+ public void warning(TransformerException te) throws TransformerException {
+ if (!handleException(te))
+ logger.warn(te.getMessageAndLocation(), te);
+ }
+
+ @Override
+ public void error(TransformerException te) throws TransformerException {
+ if (!handleException(te))
+ logger.error(te.getMessageAndLocation(), te);
+ }
+
+ @Override
+ public void fatalError(TransformerException te) throws TransformerException {
+ if (!handleException(te))
+ logger.error(te.getMessageAndLocation(), te);
+ }
+
+ protected String getLocation(SourceLocator sl) {
+ if (sl.getColumnNumber()<0)
+ return "-1";
+ return sl.getSystemId()+":"+sl.getLineNumber()+":"+sl.getColumnNumber();
+ }
+
+ @Override
+ public void message(XdmNode xn, boolean bln, SourceLocator sl) {
+ if (!handleMessage(xn.getStringValue(),getLocation(sl),null)) {
+ if (bln)
+ logger.error("["+getLocation(sl)+"]: "+xn.getStringValue());
+ else
+ logger.info("["+getLocation(sl)+"]: "+xn.getStringValue());
+ }
+ }
+ }
}
diff --git a/src/main/java/nl/mpi/oai/harvester/control/Configuration.java b/src/main/java/nl/mpi/oai/harvester/control/Configuration.java
index b418b1b8..99fcade7 100644
--- a/src/main/java/nl/mpi/oai/harvester/control/Configuration.java
+++ b/src/main/java/nl/mpi/oai/harvester/control/Configuration.java
@@ -35,11 +35,13 @@
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.transform.TransformerConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
+
+import static org.mockito.Mockito.reset;
+
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
@@ -57,7 +59,6 @@
import java.util.Map;
import java.util.Optional;
import java.util.Set;
-import java.util.stream.Collectors;
/**
@@ -72,6 +73,7 @@ public class Configuration {
private static final Set DEFAULT_INCLUDE_SETS = ImmutableSet.of("*");
private final XPath xpath;
+ private RegistryReader registryReader = null;
/**
* Configuration options stored as key-value pairs.
@@ -113,7 +115,7 @@ public String toString() {
return val;
}
}
-
+
/**
* Map file
*/
@@ -306,7 +308,7 @@ private void parseActions(Node base) throws XPathExpressionException {
}
}
act = new TransformAction(base, xslFile, cache, jobs);
- } catch (IOException | TransformerConfigurationException ex) {
+ } catch (Exception ex) {
logger.error(ex);
}
}
@@ -332,6 +334,7 @@ private void parseActions(Node base) throws XPathExpressionException {
* @param base top node of the providers section
*/
private void parseProviders(Node base) throws
+ IOException,
XPathExpressionException,
MalformedURLException,
ParserConfigurationException {
@@ -412,10 +415,9 @@ private void parseProviders(Node base) throws
}
}
// get the list of endpoints from the centre registry
- final RegistryReader rr = new RegistryReader();
-
+ registryReader = new RegistryReader(new java.net.URL(rUrl));
final Map> endPointOaiPmhSetMap
- = rr.getEndPointOaiPmhSetMap(new java.net.URL(rUrl));
+ = registryReader.getEndPointOaiPmhSetMap();
// use the list to create the list of endpoints to harvest from
for (String provUrl : endPointOaiPmhSetMap.keySet()) {
@@ -478,8 +480,9 @@ private void parseProviders(Node base) throws
final String[] sets = includedSets.stream()
.map(CentreRegistrySetDefinition::getSetSpec)
.toArray(String[]::new);
-
- provider.setSets(sets);
+ if(sets.length > 0) {
+ provider.setSets(sets);
+ }
}
}
@@ -648,7 +651,7 @@ public String getMapFile() {
PrintWriter map = null;
try {
map = new PrintWriter(new FileWriter(mapFile,true));
- map.println("endpointUrl,directoryName");
+ map.println("endpointUrl,directoryName,centreName,nationalProject");
} catch (IOException e) {
logger.error("couldn't create an initial/default " + mapFile + " file: ", e);
} finally {
@@ -707,7 +710,10 @@ private void setTimeout(int sec) {
*/
public boolean isIncremental() {
String s = settings.get(KnownOptions.INCREMENTAL.toString());
- return (s == null) ? false : Boolean.valueOf(s);
+ boolean r = (s == null) ? false : Boolean.valueOf(s);
+ if (r)
+ logger.warn("Incremental harvesting cannot be enabled ... needs to be finished!");
+ return false;
}
/**
@@ -725,6 +731,21 @@ public String getScenario() {
String s = settings.get(KnownOptions.SCENARIO.toString());
return (s == null) ? "ListIndentifiers" : s;
}
+
+ /**
+ * Get the registry reader created while parsing the providers section; null if none was created
+ */
+ public RegistryReader getRegistryReader() {
+ return this.registryReader;
+ }
+
+ /**
+ * Check whether a registry reader is available, i.e. the registryReader field is not null
+ */
+ public boolean hasRegistryReader() {
+ return (this.registryReader!=null);
+ }
+
/**
* Log parsed contents of the configuration.
diff --git a/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java b/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java
index 4562f95a..f6cf9d07 100644
--- a/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java
+++ b/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java
@@ -102,7 +102,7 @@ private static void runSynchronizationForNoDeletionMode(final Provider provider)
int retryDelay = provider.getRetryDelay(counter);
if (retryDelay > 0) {
try {
- Thread.sleep(retryDelay);
+ Thread.sleep(retryDelay*1000);
} catch (InterruptedException e) {
logger.error(e.getMessage(), e);
}
diff --git a/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java b/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java
index 8a9c7d5e..4fb92d84 100644
--- a/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java
+++ b/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java
@@ -15,259 +15,193 @@
* LICENSE-gpl-3.0.txt. If that file is missing, see
* .
*/
-
package nl.mpi.oai.harvester.control;
-import nl.mpi.oai.harvester.metadata.NSContext;
+import static com.jayway.jsonpath.Criteria.where;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.Filter;
+import static com.jayway.jsonpath.Filter.filter;
+import com.jayway.jsonpath.JsonPath;
+import static com.jayway.jsonpath.JsonPath.parse;
+import com.jayway.jsonpath.Option;
+import nl.mpi.oai.harvester.Provider;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
-import org.w3c.dom.DOMException;
-import org.w3c.dom.Document;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.xpath.XPath;
-import javax.xml.xpath.XPathConstants;
-import javax.xml.xpath.XPathExpressionException;
-import javax.xml.xpath.XPathFactory;
+import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
-import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* This class reads information from the REST service of the CLARIN Centre
- * Registry (see http://www.clarin.eu/content/centres for more information).
+ * Registry (see http://www.clarin.eu/content/centres for more information).
*
* @author Lari Lampen (MPI-PL)
*/
public class RegistryReader {
+
private static final Logger logger = LogManager.getLogger(RegistryReader.class);
- private final XPath xpath;
+ private static URL registryUrl = null;
- /** Create a new registry reader object. */
- public RegistryReader() {
- XPathFactory xpf = XPathFactory.newInstance();
- xpath = xpf.newXPath();
- NSContext nsContext = new NSContext();
- nsContext.add("cmd", "http://www.clarin.eu/cmd/");
- xpath.setNamespaceContext(nsContext);
- }
+ private static final Map modelCache = new HashMap<>();
+
+ //JsonPath configuration
+ private static com.jayway.jsonpath.Configuration conf = com.jayway.jsonpath.Configuration.defaultConfiguration();
/**
- * Get a list of all OAI-PMH endpoint URLs defined in the
- * specified registry.
- *
- * @param registryUrl url of the registry endpoint
- * @return list of all OAI-PMH endpoint URLs
+ * Create a new registry reader object.
*/
- public List getEndpoints(URL registryUrl) {
- // Basically this makes a simple REST call to get a list of
- // addresses for a further batch of REST calls. This is not
- // documented in detail since it's specific to the CLARIN
- // registry implementation anyway.
- List endpoints = new ArrayList<>();
- try {
- Document doc = openRemoteDocument(registryUrl);
- List provUrls = getProviderInfoUrls(doc);
-
- logger.info("Fetching information on " + provUrls.size()
- + " centres");
- for (String centreInfoUrl : provUrls) {
- doc = openRemoteDocument(new URL(centreInfoUrl));
- NodeList ends = getEndpoints(doc);
- if (ends != null) {
- for (int i =0;i> getEndPointOaiPmhSetMap(URL registryUrl) {
- // Basically this makes a simple REST call to get a list of
- // addresses for a further batch of REST calls. This is not
- // documented in detail since it's specific to the CLARIN
- // registry implementation anyway.
- final Map> map = new HashMap<>();
- try {
- final Document centresDoc = openRemoteDocument(registryUrl);
- final List provUrls = getProviderInfoUrls(centresDoc);
-
- logger.info("Fetching information on " + provUrls.size()
- + " centres");
-
- for (String centreInfoUrl : provUrls) {
- final Document centreDoc = openRemoteDocument(new URL(centreInfoUrl));
- final NodeList endpointsList = getEndpoints(centreDoc);
- if (endpointsList != null) {
- for (int i =0;i sets = getOaiPmhSetsForEndpoint(centreDoc, endpoint);
- map.put(endpoint, sets);
- }
- }
- }
- } catch (IOException | ParserConfigurationException | SAXException
- | XPathExpressionException | DOMException e) {
- logger.error("Error reading from centre registry", e);
- }
- return map;
+ public RegistryReader(URL url) {
+     registryUrl = url; // static field shared by all readers, so no 'this.' qualifier
+     // JsonPath's Configuration is immutable: addOptions() returns a NEW instance, so the result must be kept.
+     conf = conf.addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS);
}
- /**
- * Extract links to all provider information pages from the summary
- * document returned by the centre registry
- *
- * @param doc center registry cycle response
- * @return list of URLs of provider-specific info pages
- * @throws XPathExpressionException problem with the paths to query the center registry response
- */
- public List getProviderInfoUrls(Document doc) throws XPathExpressionException {
- if (doc == null) {
- logger.warn("The centre registry response is missing");
- return Collections.emptyList();
- }
+ private HttpURLConnection getConnection(URL url, String contentType) throws IOException {
+ HttpURLConnection connection = null;
+ Boolean redirect = false;
- NodeList centres = (NodeList) xpath.evaluate("/Centers/CenterProfile/Center_id_link/text()",
- doc.getDocumentElement(), XPathConstants.NODESET);
- List provUrls = new ArrayList<>();
- for (int j=0; j getOaiPmhSetsForEndpoint(final Document centreDoc, final String endpoint) throws XPathExpressionException, DOMException {
- Set sets = new HashSet<>();
- final NodeList setList = getOaiPmhSets(centreDoc, endpoint);
- if(setList == null) {
- logger.debug("No set list for endpoint {}", endpoint);
+ private DocumentContext getModel(String model) throws IOException {
+ DocumentContext res = null;
+ if (modelCache.containsKey(model)) {
+ res = modelCache.get(model);
} else {
- for(int s=0;s getEndpoints() throws IOException {
+ DocumentContext model = getModel("OAIPMHEndpoint");
+ List endpoints = model.read("$..uri");
+ logger.info("Found " + endpoints.size() + " endpoints");
+ return endpoints;
+ }
- NodeList sets = (NodeList) xpath.evaluate("/cmd:CMD/cmd:Components/cmd:CenterProfile/cmd:CenterExtendedInformation/cmd:Metadata[cmd:OaiAccessPoint='" + endpoint +"']/cmd:OaiPmhSets/cmd:Set",
- providerInfo.getDocumentElement(), XPathConstants.NODESET);
- return sets;
+ private List