diff --git a/.gitignore b/.gitignore index dfcef86c..8c4ab2c7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,9 @@ run/ nb-configuration.xml nbactions.xml +.classpath +.factorypath +.project +.settings +.vscode +.DS_Store diff --git a/.travis.yml b/.travis.yml index 9faa181d..2e9b4f53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ sudo: false language: java jdk: -- openjdk8 +- openjdk11 diff --git a/README.md b/README.md index 42c7ba51..c49ca89a 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,10 @@ OAI-PMH **endpoint**. # Building -Building this app requires JDK 8 and Apache Maven. It can be built +Building this app requires JDK 11 and Apache Maven. It can be built simply using the command: -```mvn clean package assembly:assembly``` +```mvn clean install``` If you use a Java IDE, it is highly likely it also offers a simple way to do the above. @@ -37,7 +37,7 @@ You can also use the `build.sh` script to run a build within an environment provisioned with suitable versions of the JDK and Maven. Requires docker. The above build process creates a package named -`oai-harvest-manager-x.y.z.tar.gz` (where x.y.z is a version number). +`target/oai-harvest-manager-x.y.z.tar.gz` (where x.y.z is a version number). # Running the Application @@ -60,6 +60,8 @@ override the timeout value defined in `config.xml`, if any. The first parameter that does not contain = is taken as the configuration file name. +If you used `build.sh` to run a build you can use `run.sh config.xml` to run this build + # Configuration @@ -77,8 +79,9 @@ file. The configuration file is composed of four sections: listed. To get a clear idea of the structure of the configuration file, see -the [sample configuration files](src/main/resources) in juxtaposition -with the explanation for each section below. 
+the [sample configuration files](src/main/resources) or the +[CLARIN configuration files](https://github.com/clarin-eric/oai-harvest-config) in +juxtaposition with the explanation for each section below. ## Configuring Settings @@ -142,7 +145,11 @@ action types are available: - The *transform* action applies a mapping, defined in an XSLT file, to the metadata record. This can be used, among other things, for semantic mapping between metadata schemata. See the included - configuration files for an example. + configuration files for an example. The XSLT receives various parameters: + 1. ```config``` the configuration file used + 2. ```provider_name``` the provider name + 3. ```provider_uri``` the endpoint + 4. ```record_identifier``` the id of the record to transform For each provider, the first format definition that the provider supports will determine the action sequence to be executed. If one of @@ -174,6 +181,10 @@ For each provider, the following can be defined: delay and timeout) can be overwritten for a specific provider by adding them as attributes to the provider element. +- The attribute *exclusive*, when set to true, indicates that the + provider should be harvested on its own, i.e. no other harvesting threads + should be active. This can be used when a provider has some huge records. + - The provider element may contain multiple *set* child elements, which specify the names of OAI-PMH sets to be harvested. @@ -182,8 +193,10 @@ a *centre registry*. So far, this registry is only used by the CLARIN community. The registry is specified by its URL. All the provider endpoints defined in the registry will be harvested. Sometimes, it might be necessary to exclude an endpoint from the ones defined in the registry. This can be done by specifying -its URL in the configuration file used for harvesting. Please review the -instructions in the configuration files supplied in the package. +its URL in the configuration file used for harvesting. 
In other cases +an endpoint loaded from the registry needs its own specific timeout configuration; +this can be done in a similar vein to excluding. Please review the +instructions in the configuration files supplied in the package. # Static Providers @@ -222,10 +235,6 @@ convenient for debugging specific providers. # Implementation Notes -Saxon is used as the XPath engine, although only standard APIs are -used and hence changing to a different XPath processor would be -trivial. - Processing for each provider runs in a separate thread. It is not possible to target a single provider with multiple threads (except in the special case where sets are used; then it is possible to mention @@ -259,9 +268,4 @@ action actionSequences, and 5 each for the directories ```cmdi``` and The pooling implementation is particularly important when transformations are used, as preparing a transformation object -involves parsing the XSLT, potentially a time-consuming process. - - -# Build Status - -[![Build Status](https://travis-ci.org/TheLanguageArchive/oai-harvest-manager.png?branch=master)](https://travis-ci.org/TheLanguageArchive/oai-harvest-manager) +involves parsing the XSLT, potentially a time-consuming process. 
\ No newline at end of file diff --git a/assembly.xml b/assembly.xml index 72f0b322..a7b47641 100644 --- a/assembly.xml +++ b/assembly.xml @@ -12,11 +12,9 @@ - config*.xml - oai2.xsl + config*.xml addOAISetName.xsl - olac2cmdi.xsl - sil_to_iso6393.xml + filter.xsl - + + + + CLARIN + CLARIN Repository + https://nexus.clarin.eu/content/repositories/Clarin + + false + + + + CLARIN-Snapshot + CLARIN Snapshot Repository + https://nexus.clarin.eu/content/repositories/clarin-snapshot + + true + + + - - ibiblio - ibiblio.org - http://mirrors.ibiblio.org/pub/mirrors/maven2 - - + + - - - - org.apache.logging.log4j - log4j-slf4j-impl - 2.13.1 - + + org.glassfish.jaxb + codemodel + 3.0.0-M4 + + + org.glassfish.jaxb + txw2 + 3.0.0-M4 + + + com.sun.xsom + xsom + 20140925 + + + com.sun.istack + 4.0.0-M3 + istack-commons-runtime + + + com.sun.xml.bind.external + rngom + 3.0.0-M4 + - - net.sf.saxon - Saxon-HE - 9.5.1-8 - - - xalan - xalan - 2.7.2 - + + org.apache.logging.log4j + log4j-slf4j-impl + 2.13.3 + + + org.slf4j + log4j-over-slf4j + 1.7.30 + - - junit - junit - 4.13 - test - + + net.sf.saxon + Saxon-HE + 9.5.1-8 + - - joda-time - joda-time - 2.2 - + + nl.mpi.tla + SaxonUtils + 1.0-SNAPSHOT + jar + - - commons-io - commons-io - 2.5 - + + xalan + xalan + 2.7.2 + - - org.mockito - mockito-all - 1.9.5 - + + junit + junit + 4.13.1 + test + - - org.codehaus.woodstox - woodstox-core-asl - 4.2.0 - jar - + + joda-time + joda-time + 2.10.6 + + + + commons-io + commons-io + 2.7 + + + + org.mockito + mockito-core + 3.4.6 + + + + + org.codehaus.woodstox + woodstox-core-asl + 4.4.1 + jar + - - com.github.tomakehurst - wiremock-jre8 - 2.24.1 - test - - - com.google.guava - guava - 28.2-jre - - + + com.jayway.jsonpath + json-path + 2.4.0 + + + + com.github.tomakehurst + wiremock-jre8 + 2.27.1 + test + + + com.google.guava + guava + 29.0-jre + + + + + javax.annotation + javax.annotation-api + 1.3.2 + + + javax.xml.bind + jaxb-api + 2.4.0-b180725.0427 + + + org.glassfish.jaxb + jaxb-runtime 
+ 2.4.0-b180725.0644 + + + + + + + ${project.artifactId}-${versionNumber} + + + + src/test/resources + + ** + + + - - - ${project.artifactId}-${versionNumber} - - - - src/test/resources - - ** - - - + + + + + maven-deploy-plugin + 3.0.0-M1 + + true + + + + - - - - - maven-deploy-plugin - - true - - - - + - + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + enforce-maven + + enforce + + + + + 3.0.5 + + + + + + - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.8 - 1.8 - - + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + + - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.3 - - src/main/java;target/generated-sources/jaxb - - + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.2.0 + + src/main/java;target/generated-sources/jaxb + + - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - package - - copy-dependencies - - - target/lib - - true - true - - - - - + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + target/lib + true + true + + + + - - - org.apache.maven.plugins - maven-jar-plugin - 2.5 - - target/classes - - - lib/ - true - nl.mpi.oai.harvester.control.Main - - - ${user.name} - ${maven.build.timestamp} - ${project.version} - ${buildNumber} - ${project.artifactId}-${versionNumber}.jar - . - - - - **/log4j.properties - - - + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + target/classes + + + lib/ + true + false + nl.mpi.oai.harvester.control.Main + + + ${user.name} + ${maven.build.timestamp} + ${project.version} + ${buildNumber} + ${project.artifactId}-${versionNumber}.jar + . 
+ + + + **/log4j.properties + + + - - - org.apache.maven.plugins - maven-assembly-plugin - 2.4 - - ${project.artifactId}-${versionNumber} - false - - assembly.xml - - - + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + ${project.artifactId}-${versionNumber} + false + + assembly.xml + + + + + make-assembly + package + + single + + + + - - - org.codehaus.mojo - buildnumber-maven-plugin - 1.2 - - - validate - - create - - - none - 6 - - - - - false - false - - + + + org.codehaus.mojo + buildnumber-maven-plugin + 1.4 + + + validate + + create + + + none + 6 + + + + + false + false + + - - - org.codehaus.mojo - jaxb2-maven-plugin - - 2.4 - - - xjc - - xjc - - - - - - nl.mpi.oai.harvester.generated - - - + + + org.codehaus.mojo + jaxb2-maven-plugin + + 2.5.0 + + + xjc + + xjc + + + + + nl.mpi.oai.harvester.generated + + - - - org.codehaus.mojo - findbugs-maven-plugin - 3.0.0 - - Normal - Min - - - - + + + org.codehaus.mojo + findbugs-maven-plugin + 3.0.5 + + Normal + Min + + + + - - - + + + - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.3 - - - + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.2.0 + + + diff --git a/run.sh b/run.sh index e6e5aa12..7f177dfb 100755 --- a/run.sh +++ b/run.sh @@ -1,8 +1,8 @@ #!/bin/bash JAVA_TARGET_DIR="$(cd "$(dirname $0)" && pwd)/target" -JAVA_IMAGE=registry.gitlab.com/clarin-eric/docker-alpine-supervisor-java-base:openjdk8-1.2.5 +JAVA_IMAGE=registry.gitlab.com/clarin-eric/docker-alpine-supervisor-java-base:openjdk11-1.2.12 CONTAINER_CONF_FILE_PATH='/tmp/harvester.conf' -JAVA_CMD="java -Dlogdir=/logdir -jar /java-bin/oai-harvest-manager*.jar ${CONTAINER_CONF_FILE_PATH}" +JAVA_CMD="java -Dlogdir=/logdir -jar /java-bin/oai-harvest-manager*.jar workdir=/workdir ${CONTAINER_CONF_FILE_PATH}" WORKDIR="${WORKDIR:-$(pwd)/run/workdir}" LOGDIR="${LOGDIR:-$(pwd)/run/log}" CONFIG_FILE="$1" diff --git a/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java 
b/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java index 09b1f9ca..68587b34 100644 --- a/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java +++ b/src/main/java/ORG/oclc/oai/harvester2/verb/HarvesterVerb.java @@ -326,7 +326,7 @@ public void harvest(String requestURL, int timeout, Path temp) throws MalformedU retrySeconds = retryDate - now; } if (retrySeconds == 0) { // Apparently, it's a bad URL - throw new FileNotFoundException("Bad URL?"); + throw new FileNotFoundException("Bad URL["+requestURL+"]?"); } logger.debug("Retry-After=" + retrySeconds); if (retrySeconds > 0) { diff --git a/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java b/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java index 70e8652f..f1bf7938 100644 --- a/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java +++ b/src/main/java/ORG/oclc/oai/harvester2/verb/ListRecords.java @@ -18,6 +18,7 @@ package ORG.oclc.oai.harvester2.verb; +import com.ctc.wstx.exc.WstxUnexpectedCharException; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -150,16 +151,22 @@ public String getResumptionToken() } break; } + + outer: if (xmlr.hasNext()) - xmlr.next(); + try { + xmlr.next(); + } catch (WstxUnexpectedCharException ex) { + logger.info(String.format("Invalid char found in XML, skipping the current one and look for next one: {%s}", xmlr.toString())); + } else state = state == 1? 
0: -1;// if START then STOP else ERROR } if (state < 0 || token == null) { - logger.warn("couldn't find token in the XML stream!"); + logger.debug("couldn't find token, done!"); return null; } - logger.debug("found token["+token+"] in the XML stream!"); + logger.debug("found token["+token+"], resume!"); return token; } else if (schemaLocation.indexOf(SCHEMA_LOCATION_V1_1_LIST_RECORDS) != -1) { return getSingleString("/oai11_ListRecords:ListRecords/oai11_ListRecords:resumptionToken"); diff --git a/src/main/java/nl/mpi/oai/harvester/Provider.java b/src/main/java/nl/mpi/oai/harvester/Provider.java index 5aed22be..87b649c7 100644 --- a/src/main/java/nl/mpi/oai/harvester/Provider.java +++ b/src/main/java/nl/mpi/oai/harvester/Provider.java @@ -196,7 +196,7 @@ public void setDeletionMode(DeletionMode deletionMode) { this.deletionMode = deletionMode; } - /** + /** * Set the name of this provider * * @param name name of provider @@ -223,6 +223,10 @@ public String getOaiUrl() { return oaiUrl; } + public boolean hasSets() { + return (sets!=null && 0<=sets.length); + } + public String[] getSets() { return sets; } @@ -415,7 +419,7 @@ public Metadata getRecord(String id, String mdPrefix) { int retryDelay = getRetryDelay(i); if (retryDelay > 0) { try { - Thread.sleep(retryDelay); + Thread.sleep(retryDelay*1000); } catch(InterruptedException e) { logger.error(e.getMessage(), e); } diff --git a/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java b/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java index dc70a3d8..2eb0017e 100644 --- a/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java +++ b/src/main/java/nl/mpi/oai/harvester/action/TransformAction.java @@ -18,23 +18,31 @@ package nl.mpi.oai.harvester.action; -import net.sf.saxon.Configuration; -import net.sf.saxon.TransformerFactoryImpl; +import net.sf.saxon.s9api.DOMDestination; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.SaxonApiException; +import 
net.sf.saxon.s9api.XdmAtomicValue; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.s9api.XsltTransformer; import nl.mpi.oai.harvester.metadata.Metadata; +import nl.mpi.tla.util.Saxon; +import nl.mpi.tla.util.SaxonListener; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.w3c.dom.Document; +import org.w3c.dom.Node; -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMResult; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.URIResolver; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.sax.SAXSource; -import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.net.MalformedURLException; @@ -43,7 +51,10 @@ import java.nio.file.Path; import java.util.List; import java.util.concurrent.Semaphore; -import org.w3c.dom.Node; +import javax.xml.transform.ErrorListener; +import javax.xml.transform.SourceLocator; +import net.sf.saxon.s9api.MessageListener; +import net.sf.saxon.s9api.XsltExecutable; /** * This class represents the application of an XSL transformation to the @@ -53,6 +64,9 @@ */ public class TransformAction implements Action { private static final Logger logger = LogManager.getLogger(TransformAction.class); + + /** The XSL executable. */ + private final XsltExecutable executable; /** The file containing the XSL transformation. */ private String xsltFile; @@ -60,12 +74,6 @@ public class TransformAction implements Action { /** The directory containing cached resources. 
*/ private Path cacheDir; - /** Transformer factory */ - TransformerFactory factory; - - /** Prepared XSL transformation object. */ - private Templates templates; - /** A standard semaphore is used to track the number of running transforms. */ private Semaphore semaphore; @@ -80,8 +88,11 @@ public class TransformAction implements Action { * @param maxJobs the maximum number of concurrent transforms * @throws FileNotFoundException stylesheet couldn't be found * @throws TransformerConfigurationException there is a problem with the stylesheet + * @throws java.net.MalformedURLException + * @throws net.sf.saxon.s9api.SaxonApiException */ - public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs) throws FileNotFoundException, TransformerConfigurationException { + public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs) + throws FileNotFoundException, TransformerConfigurationException, MalformedURLException, SaxonApiException { this(conf, xsltFile,cacheDir,(maxJobs>0?new Semaphore(maxJobs):null)); } @@ -93,30 +104,23 @@ public TransformAction(Node conf, String xsltFile,Path cacheDir,int maxJobs) thr * @param semaphore a semaphore to control the concurrent number of transforms * @throws FileNotFoundException stylesheet couldn't be found * @throws TransformerConfigurationException there is a problem with the stylesheet + * @throws java.net.MalformedURLException + * @throws net.sf.saxon.s9api.SaxonApiException */ - public TransformAction(Node conf, String xsltFile,Path cacheDir,Semaphore semaphore) throws FileNotFoundException, TransformerConfigurationException { + public TransformAction(Node conf, String xsltFile,Path cacheDir,Semaphore semaphore) + throws FileNotFoundException, TransformerConfigurationException, MalformedURLException, SaxonApiException { this.config = conf; - this.xsltFile = xsltFile; + this.xsltFile = xsltFile; this.cacheDir = cacheDir; this.semaphore = semaphore; - factory = 
TransformerFactory.newInstance("net.sf.saxon.TransformerFactoryImpl", null); - if(factory instanceof TransformerFactoryImpl) { - TransformerFactoryImpl transformerFactoryImpl = ((TransformerFactoryImpl)factory); - logger.debug("Telling Saxon to send messages as warnings to logger"); - final Configuration tfConfig = transformerFactoryImpl.getConfiguration(); - tfConfig.setMessageEmitterClass("net.sf.saxon.serialize.MessageWarner"); - if (cacheDir != null) { - logger.debug("Setting the URLResolve to cache in "+cacheDir); - transformerFactoryImpl.setURIResolver(new TransformActionURLResolver(transformerFactoryImpl.getURIResolver())); - } - } - factory.setErrorListener(new TransformActionErrorListener()); Source xslSource = null; if (xsltFile.startsWith("http:") || xsltFile.startsWith("https:")) xslSource = new StreamSource(xsltFile); else xslSource = new StreamSource(new FileInputStream(xsltFile),xsltFile); - templates = factory.newTemplates(xslSource); + + executable = Saxon.buildTransformer(Saxon.buildDocument(xslSource)); + } @Override @@ -133,30 +137,38 @@ public boolean perform(List records) { } catch (InterruptedException e) { } } } - Transformer transformer = templates.newTransformer(); Source source = null; - Result output = null; + Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); + DOMDestination output = new DOMDestination(doc); if (record.hasStream()) { source = new SAXSource(record.getSource()); - output = new StreamResult(new ByteArrayOutputStream()); } else { source = new DOMSource(record.getDoc()); - output = new DOMResult(); } - transformer.setParameter("config", this.config.getOwnerDocument()); - transformer.setParameter("provider_name",record.getOrigin().getName()); - transformer.setParameter("provider_uri",record.getOrigin().getOaiUrl()); - transformer.setParameter("record_identifier",record.getId()); - transformer.transform(source, output); - if (record.hasStream()) { - byte[] bytes = 
((ByteArrayOutputStream)((StreamResult)output).getOutputStream()).toByteArray(); - record.setStream(new ByteArrayInputStream(bytes)); - logger.debug("transformed to XML stream with ["+bytes.length+"] bytes"); - } else { - record.setDoc((Document) ((DOMResult)output).getNode()); - logger.debug("transformed to XML doc with ["+XPathFactory.newInstance().newXPath().evaluate("count(//*)", record.getDoc())+"] nodes"); + XdmNode old = Saxon.buildDocument(source); + XsltTransformer transformer = executable.load(); + + TransformActionListener listener = new TransformActionListener(); + transformer.setErrorListener(listener); + transformer.setMessageListener(listener); + + if (cacheDir != null) { + logger.debug("Setting the URLResolve to cache in "+cacheDir); + transformer.setURIResolver(new TransformActionURLResolver(transformer.getURIResolver())); } - } catch (TransformerException | XPathExpressionException ex) { + + transformer.setSource(old.asSource()); + transformer.setDestination(output); + + transformer.setParameter(new QName("config"), Saxon.wrapNode(this.config.getOwnerDocument())); + transformer.setParameter(new QName("provider_name"), new XdmAtomicValue(record.getOrigin().getName())); + transformer.setParameter(new QName("provider_uri"), new XdmAtomicValue(record.getOrigin().getOaiUrl())); + transformer.setParameter(new QName("record_identifier"), new XdmAtomicValue(record.getId())); + + transformer.transform(); + record.setDoc(doc); + logger.debug("transformed to XML doc with ["+XPathFactory.newInstance().newXPath().evaluate("count(//*)", record.getDoc())+"] nodes"); + } catch (XPathExpressionException | SaxonApiException | ParserConfigurationException ex) { logger.error("Transformation error: ",ex); return false; } finally { @@ -171,57 +183,32 @@ public boolean perform(List records) { @Override public String toString() { - return "transform using " + xsltFile; + return "transform using " + xsltFile; } // Transform actions differ if and only if the XSLT files 
differ. @Override public int hashCode() { - return xsltFile.hashCode(); + return xsltFile.hashCode(); } @Override public boolean equals(Object o) { - if (o instanceof TransformAction) { - TransformAction t = (TransformAction)o; - return xsltFile.equals(t.xsltFile); - } - return false; + if (o instanceof TransformAction) { + TransformAction t = (TransformAction)o; + return xsltFile.equals(t.xsltFile); + } + return false; } @Override public Action clone() { - try { - // This is a deep copy. The new object has its own Transform object. - return new TransformAction(config, xsltFile,cacheDir,semaphore); - } catch (FileNotFoundException | TransformerConfigurationException ex) { - logger.error(ex); - } - return null; - } - - class TransformActionErrorListener implements ErrorListener { - - public TransformActionErrorListener() { - logger.debug("Redirecting XSLT warnings and errors to this logger"); - } - - @Override - public void warning(TransformerException te) throws TransformerException { - logger.warn("Transformer warning: "+te.getMessageAndLocation()); - //logger.debug("Transformation warning stacktrace", te); - } - - @Override - public void error(TransformerException te) throws TransformerException { - // errors will be caught by the service, so swallow here except in debug - logger.debug("Transformer error", te); - } - - @Override - public void fatalError(TransformerException te) throws TransformerException { - // errors will be caught by the service, so swallow here except in debug - logger.debug("Transformer fatal error", te); - } + try { + // This is a deep copy. The new object has its own Transform object. 
+ return new TransformAction(config, xsltFile,cacheDir,semaphore); + } catch (FileNotFoundException | TransformerConfigurationException | MalformedURLException | SaxonApiException ex) { + logger.error(ex); + } + return null; } class TransformActionURLResolver implements URIResolver { @@ -251,13 +238,69 @@ public Source resolve(String href, String base) throws TransformerException { logger.debug("Transformer resolver: loaded "+cacheFile+" from cache"); } else { res = resolver.resolve(href, base); - Result result = new StreamResult(cacheDir.resolve(cacheFile).toFile()); - Transformer xformer = factory.newTransformer(); - xformer.transform(res, result); - logger.debug("Transformer resolver: stored "+cacheFile+" in cache"); + try { + Saxon.save(res, cacheDir.resolve(cacheFile).toFile()); + logger.debug("Transformer resolver: stored "+cacheFile+" in cache"); + } catch (SaxonApiException ex) { + throw new TransformerException(ex); + } } return res; } } + class TransformActionListener implements MessageListener, ErrorListener { + + protected boolean handleMessage(String msg, String loc, Exception e) { + if (msg.startsWith("INF: ")) + logger.info(msg.replace("INF: ", "")); + else if (msg.startsWith("WRN: ")) + logger.warn("["+loc+"]: "+msg.replace("WRN: ", ""), e); + else if (msg.startsWith("ERR: ")) + logger.error("["+loc+"]: "+msg.replace("ERR: ", ""), e); + else if (msg.startsWith("DBG: ")) + logger.debug("["+loc+"]: "+msg.replace("DBG: ", ""), e); + else + return false; + return true; + } + + protected boolean handleException(TransformerException te) { + return handleMessage(te.getMessage(), te.getLocationAsString(), te); + } + + @Override + public void warning(TransformerException te) throws TransformerException { + if (!handleException(te)) + logger.warn(te.getMessageAndLocation(), te); + } + + @Override + public void error(TransformerException te) throws TransformerException { + if (!handleException(te)) + logger.error(te.getMessageAndLocation(), te); + } + + 
@Override + public void fatalError(TransformerException te) throws TransformerException { + if (!handleException(te)) + logger.error(te.getMessageAndLocation(), te); + } + + protected String getLocation(SourceLocator sl) { + if (sl.getColumnNumber()<0) + return "-1"; + return sl.getSystemId()+":"+sl.getLineNumber()+":"+sl.getColumnNumber(); + } + + @Override + public void message(XdmNode xn, boolean bln, SourceLocator sl) { + if (!handleMessage(xn.getStringValue(),getLocation(sl),null)) { + if (bln) + logger.error("["+getLocation(sl)+"]: "+xn.getStringValue()); + else + logger.info("["+getLocation(sl)+"]: "+xn.getStringValue()); + } + } + } } diff --git a/src/main/java/nl/mpi/oai/harvester/control/Configuration.java b/src/main/java/nl/mpi/oai/harvester/control/Configuration.java index b418b1b8..99fcade7 100644 --- a/src/main/java/nl/mpi/oai/harvester/control/Configuration.java +++ b/src/main/java/nl/mpi/oai/harvester/control/Configuration.java @@ -35,11 +35,13 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.TransformerConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; + +import static org.mockito.Mockito.reset; + import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; @@ -57,7 +59,6 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; /** @@ -72,6 +73,7 @@ public class Configuration { private static final Set DEFAULT_INCLUDE_SETS = ImmutableSet.of("*"); private final XPath xpath; + private RegistryReader registryReader = null; /** * Configuration options stored as key-value pairs. 
@@ -113,7 +115,7 @@ public String toString() { return val; } } - + /** * Map file */ @@ -306,7 +308,7 @@ private void parseActions(Node base) throws XPathExpressionException { } } act = new TransformAction(base, xslFile, cache, jobs); - } catch (IOException | TransformerConfigurationException ex) { + } catch (Exception ex) { logger.error(ex); } } @@ -332,6 +334,7 @@ private void parseActions(Node base) throws XPathExpressionException { * @param base top node of the providers section */ private void parseProviders(Node base) throws + IOException, XPathExpressionException, MalformedURLException, ParserConfigurationException { @@ -412,10 +415,9 @@ private void parseProviders(Node base) throws } } // get the list of endpoints from the centre registry - final RegistryReader rr = new RegistryReader(); - + registryReader = new RegistryReader(new java.net.URL(rUrl)); final Map> endPointOaiPmhSetMap - = rr.getEndPointOaiPmhSetMap(new java.net.URL(rUrl)); + = registryReader.getEndPointOaiPmhSetMap(); // use the list to create the list of endpoints to harvest from for (String provUrl : endPointOaiPmhSetMap.keySet()) { @@ -478,8 +480,9 @@ private void parseProviders(Node base) throws final String[] sets = includedSets.stream() .map(CentreRegistrySetDefinition::getSetSpec) .toArray(String[]::new); - - provider.setSets(sets); + if(sets.length > 0) { + provider.setSets(sets); + } } } @@ -648,7 +651,7 @@ public String getMapFile() { PrintWriter map = null; try { map = new PrintWriter(new FileWriter(mapFile,true)); - map.println("endpointUrl,directoryName"); + map.println("endpointUrl,directoryName,centreName,nationalProject"); } catch (IOException e) { logger.error("couldn't create an initial/default " + mapFile + " file: ", e); } finally { @@ -707,7 +710,10 @@ private void setTimeout(int sec) { */ public boolean isIncremental() { String s = settings.get(KnownOptions.INCREMENTAL.toString()); - return (s == null) ? false : Boolean.valueOf(s); + boolean r = (s == null) ? 
false : Boolean.valueOf(s); + if (r) + logger.warn("Incremental harvesting cannot be enabled ... needs to be finished!"); + return false; } /** @@ -725,6 +731,21 @@ public String getScenario() { String s = settings.get(KnownOptions.SCENARIO.toString()); return (s == null) ? "ListIndentifiers" : s; } + + /** + * Get Registry Reader + */ + public RegistryReader getRegistryReader() { + return this.registryReader; + } + + /** + * Has a Registry Reader? + */ + public boolean hasRegistryReader() { + return (this.registryReader!=null); + } + /** * Log parsed contents of the configuration. diff --git a/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java b/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java index 4562f95a..f6cf9d07 100644 --- a/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java +++ b/src/main/java/nl/mpi/oai/harvester/control/FileSynchronization.java @@ -102,7 +102,7 @@ private static void runSynchronizationForNoDeletionMode(final Provider provider) int retryDelay = provider.getRetryDelay(counter); if (retryDelay > 0) { try { - Thread.sleep(retryDelay); + Thread.sleep(retryDelay*1000); } catch (InterruptedException e) { logger.error(e.getMessage(), e); } diff --git a/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java b/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java index 8a9c7d5e..4fb92d84 100644 --- a/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java +++ b/src/main/java/nl/mpi/oai/harvester/control/RegistryReader.java @@ -15,259 +15,193 @@ * LICENSE-gpl-3.0.txt. If that file is missing, see * . 
*/ - package nl.mpi.oai.harvester.control; -import nl.mpi.oai.harvester.metadata.NSContext; +import static com.jayway.jsonpath.Criteria.where; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.Filter; +import static com.jayway.jsonpath.Filter.filter; +import com.jayway.jsonpath.JsonPath; +import static com.jayway.jsonpath.JsonPath.parse; +import com.jayway.jsonpath.Option; +import nl.mpi.oai.harvester.Provider; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.w3c.dom.DOMException; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; +import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.PrintWriter; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * This class reads information from the REST service of the CLARIN Centre - * Registry (see http://www.clarin.eu/content/centres for more information). + * Registry (see http://www.clarin.eu/content/centres for more information). * * @author Lari Lampen (MPI-PL) */ public class RegistryReader { + private static final Logger logger = LogManager.getLogger(RegistryReader.class); - private final XPath xpath; + private static URL registryUrl = null; - /** Create a new registry reader object. 
*/ - public RegistryReader() { - XPathFactory xpf = XPathFactory.newInstance(); - xpath = xpf.newXPath(); - NSContext nsContext = new NSContext(); - nsContext.add("cmd", "http://www.clarin.eu/cmd/"); - xpath.setNamespaceContext(nsContext); - } + private static final Map modelCache = new HashMap<>(); + + //JsonPath configuration + private static com.jayway.jsonpath.Configuration conf = com.jayway.jsonpath.Configuration.defaultConfiguration(); /** - * Get a list of all OAI-PMH endpoint URLs defined in the - * specified registry. - * - * @param registryUrl url of the registry endpoint - * @return list of all OAI-PMH endpoint URLs + * Create a new registry reader object. */ - public List getEndpoints(URL registryUrl) { - // Basically this makes a simple REST call to get a list of - // addresses for a further batch of REST calls. This is not - // documented in detail since it's specific to the CLARIN - // registry implementation anyway. - List endpoints = new ArrayList<>(); - try { - Document doc = openRemoteDocument(registryUrl); - List provUrls = getProviderInfoUrls(doc); - - logger.info("Fetching information on " + provUrls.size() - + " centres"); - for (String centreInfoUrl : provUrls) { - doc = openRemoteDocument(new URL(centreInfoUrl)); - NodeList ends = getEndpoints(doc); - if (ends != null) { - for (int i =0;i> getEndPointOaiPmhSetMap(URL registryUrl) { - // Basically this makes a simple REST call to get a list of - // addresses for a further batch of REST calls. This is not - // documented in detail since it's specific to the CLARIN - // registry implementation anyway. 
- final Map> map = new HashMap<>(); - try { - final Document centresDoc = openRemoteDocument(registryUrl); - final List provUrls = getProviderInfoUrls(centresDoc); - - logger.info("Fetching information on " + provUrls.size() - + " centres"); - - for (String centreInfoUrl : provUrls) { - final Document centreDoc = openRemoteDocument(new URL(centreInfoUrl)); - final NodeList endpointsList = getEndpoints(centreDoc); - if (endpointsList != null) { - for (int i =0;i sets = getOaiPmhSetsForEndpoint(centreDoc, endpoint); - map.put(endpoint, sets); - } - } - } - } catch (IOException | ParserConfigurationException | SAXException - | XPathExpressionException | DOMException e) { - logger.error("Error reading from centre registry", e); - } - return map; + public RegistryReader(URL url) { + this.registryUrl = url; + + conf.addOptions(Option.ALWAYS_RETURN_LIST,Option.SUPPRESS_EXCEPTIONS); } - /** - * Extract links to all provider information pages from the summary - * document returned by the centre registry - * - * @param doc center registry cycle response - * @return list of URLs of provider-specific info pages - * @throws XPathExpressionException problem with the paths to query the center registry response - */ - public List getProviderInfoUrls(Document doc) throws XPathExpressionException { - if (doc == null) { - logger.warn("The centre registry response is missing"); - return Collections.emptyList(); - } + private HttpURLConnection getConnection(URL url, String contentType) throws IOException { + HttpURLConnection connection = null; + Boolean redirect = false; - NodeList centres = (NodeList) xpath.evaluate("/Centers/CenterProfile/Center_id_link/text()", - doc.getDocumentElement(), XPathConstants.NODESET); - List provUrls = new ArrayList<>(); - for (int j=0; j getOaiPmhSetsForEndpoint(final Document centreDoc, final String endpoint) throws XPathExpressionException, DOMException { - Set sets = new HashSet<>(); - final NodeList setList = getOaiPmhSets(centreDoc, endpoint); - 
if(setList == null) { - logger.debug("No set list for endpoint {}", endpoint); + private DocumentContext getModel(String model) throws IOException { + DocumentContext res = null; + if (modelCache.containsKey(model)) { + res = modelCache.get(model); } else { - for(int s=0;s getEndpoints() throws IOException { + DocumentContext model = getModel("OAIPMHEndpoint"); + List endpoints = model.read("$..uri"); + logger.info("Found " + endpoints.size() + " endpoints"); + return endpoints; + } - NodeList sets = (NodeList) xpath.evaluate("/cmd:CMD/cmd:Components/cmd:CenterProfile/cmd:CenterExtendedInformation/cmd:Metadata[cmd:OaiAccessPoint='" + endpoint +"']/cmd:OaiPmhSets/cmd:Set", - providerInfo.getDocumentElement(), XPathConstants.NODESET); - return sets; + private List getEndpoint(String url) throws IOException { + DocumentContext model = getModel("OAIPMHEndpoint"); + Filter uriFilter = filter(where("uri").is(url)); + return model.read("$.fields[?]",uriFilter); } - - /** - * Fetch the XML document located at the given URL, parse it, and - * return the resulting DOM tree. - */ - private static Document openRemoteDocument(URL url) throws IOException, - ParserConfigurationException, SAXException { - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setInstanceFollowRedirects(false); - connection.setRequestMethod("GET"); - connection.setRequestProperty("Content-Type", "application/xml"); - connection.connect(); - - connection.getResponseCode(); + + String endpointMapping(String endpointUrl,String endpointName) throws IOException { + String directoryName = Util.toFileFormat(endpointName).replaceAll("/", ""); - Boolean redirect = false; + Filter provFilter = filter(where("@.fields.uri").is(endpointUrl)); + List iList = (List)getModel("OAIPMHEndpoint").read("$.[?].fields.centre", provFilter); + Integer centreKey = iList.size()>0? 
iList.get(0):null; - int status = connection.getResponseCode(); - if (status != HttpURLConnection.HTTP_OK) { - if (status == HttpURLConnection.HTTP_MOVED_TEMP - || status == HttpURLConnection.HTTP_MOVED_PERM - || status == HttpURLConnection.HTTP_SEE_OTHER) { - redirect = true; + String centreName = ""; + String nationalProject = ""; + + if (centreKey != null) { + Filter centreFilter = filter(where("pk").is(centreKey)); + List sList = (List)getModel("Centre").read("$[?].fields.name", centreFilter); + centreName = sList.size()>0? sList.get(0):""; + + iList = (List)getModel("Centre").read("$[?].fields.consortium", centreFilter); + Integer consortiumKey = iList.size()>0? iList.get(0):null; + + if (consortiumKey != null) { + Filter consortiumFilter = filter(where("pk").is(consortiumKey)); + sList = (List)getModel("Consortium").read("$[?].fields.name", consortiumFilter); + nationalProject = sList.size()>0? sList.get(0):""; } } - if (redirect) { - - // get redirect url from "location" header field - String newUrl = connection.getHeaderField("Location"); + return String.format("\"%s\",\"%s\",\"%s\",\"%s\"", endpointUrl.replaceAll("\"", "\"\""), directoryName.replaceAll("\"", "\"\""), centreName.replaceAll("\"", "\"\""), nationalProject.replaceAll("\"", "\"\"")); + } - // open the new connnection again - - connection = (HttpURLConnection) new URL(newUrl).openConnection(); - connection.setInstanceFollowRedirects(false); - connection.setRequestMethod("GET"); - connection.setRequestProperty("Content-Type", "application/xml"); - - logger.debug("Redirect to URL : " + newUrl); - logger.debug(System.getProperty("java.runtime.version")); - - connection.connect(); - - connection.getResponseCode(); + public Map> getEndPointOaiPmhSetMap() { + final Map> map = new HashMap<>(); + try { + final List provUrls = getEndpoints(); + + List sList = null; + for (String provUrl : provUrls) { + Set setdef = new HashSet<>(); + //JsonPath-> 
$[?(@.fields.uri=='http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl')].fields.oai_pmh_sets + Filter provFilter = filter(where("@.fields.uri").is(provUrl)); + List s = (List)getModel("OAIPMHEndpoint").read("$.[?].fields.oai_pmh_sets", provFilter); + for(net.minidev.json.JSONArray set:s) { + for (Iterator iter = set.iterator();iter.hasNext();) { + Filter setFilter = filter(where("pk").is((Integer)iter.next())); + sList = (List)getModel("OAIPMHEndpointSet").read("$[?].fields.set_spec", setFilter); + String setSpec = (sList.size()>0 ? sList.get(0) : null); + sList = (List)getModel("OAIPMHEndpointSet").read("$[?].fields.set_type", setFilter); + String setType = (sList.size()>0 ? sList.get(0) : null); + if (setSpec!=null && setType!=null) + setdef.add(new CentreRegistrySetDefinition(setSpec, setType)); + } + } + map.put(provUrl, setdef); + } + } catch (IOException e) { + logger.error("Error reading from centre registry", e); } - - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = dbf.newDocumentBuilder(); - return db.parse(connection.getInputStream()); + return map; } + } diff --git a/src/main/java/nl/mpi/oai/harvester/control/Worker.java b/src/main/java/nl/mpi/oai/harvester/control/Worker.java index 5758d57c..aadac623 100644 --- a/src/main/java/nl/mpi/oai/harvester/control/Worker.java +++ b/src/main/java/nl/mpi/oai/harvester/control/Worker.java @@ -18,10 +18,6 @@ package nl.mpi.oai.harvester.control; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Arrays; import nl.mpi.oai.harvester.Provider; import nl.mpi.oai.harvester.StaticProvider; import nl.mpi.oai.harvester.action.ActionSequence; @@ -33,6 +29,10 @@ import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.ThreadContext; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; import java.util.List; /** @@ -71,7 +71,6 @@ retrieve 
each record in the list individually. ListRecords: skip the * Associate a provider and action actionSequences with a scenario * * @param provider OAI-PMH provider that this thread will harvest - * @param actionSequences list of actions to take on harvested metadata * @param cycle the harvesting cycle */ public Worker(Provider provider, Configuration config, @@ -107,8 +106,12 @@ public void run() { PrintWriter m = null; try { m = new PrintWriter(new FileWriter(map,true)); - m.printf("%s,%s", provider.getOaiUrl(),Util.toFileFormat(provider.getName()).replaceAll("/", "")); - m.println(); + if (config.hasRegistryReader()) { + m.println(config.getRegistryReader().endpointMapping(provider.getOaiUrl(),provider.getName())); + } else { + m.printf("%s,%s,,", provider.getOaiUrl(),Util.toFileFormat(provider.getName()).replaceAll("/", "")); + m.println(); + } } catch (IOException e) { logger.error("failed to write to the map file!",e); } finally { @@ -216,8 +219,10 @@ public void run() { // report back success or failure to the cycle endpoint.doneHarvesting(done); - FileSynchronization.saveStatistics(provider); - endpoint.setIncrement(FileSynchronization.getProviderStatistic(provider).getHarvestedRecords()); + if (config.isIncremental()) { + FileSynchronization.saveStatistics(provider); + endpoint.setIncrement(FileSynchronization.getProviderStatistic(provider).getHarvestedRecords()); + } logger.info("Processing finished for " + provider); } catch (Throwable e) { logger.error("Processing failed for " + provider+": "+e.getMessage(),e); @@ -238,4 +243,4 @@ public void run() { } } -} \ No newline at end of file +} diff --git a/src/main/java/nl/mpi/oai/harvester/harvesting/FormatHarvesting.java b/src/main/java/nl/mpi/oai/harvester/harvesting/FormatHarvesting.java index 7dfe0b1e..9f1e1809 100644 --- a/src/main/java/nl/mpi/oai/harvester/harvesting/FormatHarvesting.java +++ b/src/main/java/nl/mpi/oai/harvester/harvesting/FormatHarvesting.java @@ -127,7 +127,7 @@ public boolean 
request() { int retryDelay = provider.getRetryDelay(i-1); if (retryDelay > 0) { try { - Thread.sleep(retryDelay); + Thread.sleep(retryDelay*1000); } catch (InterruptedException e) { logger.error(e.getMessage(), e); } diff --git a/src/main/java/nl/mpi/oai/harvester/harvesting/ListHarvesting.java b/src/main/java/nl/mpi/oai/harvester/harvesting/ListHarvesting.java index 863ebbca..d51bc7ed 100644 --- a/src/main/java/nl/mpi/oai/harvester/harvesting/ListHarvesting.java +++ b/src/main/java/nl/mpi/oai/harvester/harvesting/ListHarvesting.java @@ -284,7 +284,7 @@ public boolean request() { int retryDelay = provider.getRetryDelay(i-1); if (retryDelay > 0) { try { - Thread.sleep(retryDelay); + Thread.sleep(retryDelay*1000); } catch (InterruptedException e) { logger.error(e.getMessage(), e); } diff --git a/src/main/java/nl/mpi/oai/harvester/harvesting/OAIHelper.java b/src/main/java/nl/mpi/oai/harvester/harvesting/OAIHelper.java index 1a24deb8..5d186939 100644 --- a/src/main/java/nl/mpi/oai/harvester/harvesting/OAIHelper.java +++ b/src/main/java/nl/mpi/oai/harvester/harvesting/OAIHelper.java @@ -3,6 +3,8 @@ import java.util.logging.Level; import javax.xml.namespace.QName; import javax.xml.stream.XMLStreamException; + +import com.ctc.wstx.exc.WstxUnexpectedCharException; import org.w3c.dom.Document; import org.w3c.dom.Node; @@ -93,8 +95,13 @@ static public String getPrefix (DocumentSource document){ } break; } + outer: if (xmlr.hasNext()) - xmlr.next(); + try { + xmlr.next(); + } catch (WstxUnexpectedCharException ex) { + logger.info("Invalid char found in XML, skipping the current one and look for next one"); + } else state = state == 1? 
0: -1;// if START then STOP else ERROR } diff --git a/src/main/java/nl/mpi/oai/harvester/harvesting/RecordHarvesting.java b/src/main/java/nl/mpi/oai/harvester/harvesting/RecordHarvesting.java index 70f834ef..0053cef5 100644 --- a/src/main/java/nl/mpi/oai/harvester/harvesting/RecordHarvesting.java +++ b/src/main/java/nl/mpi/oai/harvester/harvesting/RecordHarvesting.java @@ -110,7 +110,7 @@ public boolean request() { int retryDelay = provider.getRetryDelay(i-1); if (retryDelay > 0) { try { - Thread.sleep(retryDelay); + Thread.sleep(retryDelay*1000); } catch (InterruptedException e) { logger.error(e.getMessage(), e); } diff --git a/src/main/resources/config-clarin-clarin.xml b/src/main/resources/config-clarin-clarin.xml deleted file mode 100644 index f3c9c85e..00000000 --- a/src/main/resources/config-clarin-clarin.xml +++ /dev/null @@ -1,138 +0,0 @@ - - - - - - - workspace - - - 2 - - - 10000 - - - 6 - - - 4 - - - 60 - - - state.xml - - - map.csv - - - false - - ListRecords - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - trolling - - - - diff --git a/src/main/resources/config-others-others.xml b/src/main/resources/config-others-others.xml deleted file mode 100644 index 202230fb..00000000 --- a/src/main/resources/config-others-others.xml +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - workspace-olac - - - 2 - - - 10000 - - - 6 - - - 4 - - - 60 - - - state.xml - - - false - - ListRecords - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0afede87-2bf2-4d89-867e-d2ee57251c62 - - - - diff --git a/src/main/resources/config-test-test.xml b/src/main/resources/config-test-test.xml index 3b0f7fc4..5c3a222a 100644 --- a/src/main/resources/config-test-test.xml +++ b/src/main/resources/config-test-test.xml @@ -9,8 +9,9 @@ 2 - - 10000 + + 10 20 
4 @@ -21,8 +22,12 @@ 60 + + ListRecords - false @@ -40,6 +45,7 @@ + @@ -47,6 +53,7 @@ + @@ -56,8 +63,9 @@ + - + @@ -66,8 +74,9 @@ + - + @@ -94,7 +103,23 @@ ---> + + + + + +--> + provider url="https://oai.datacite.org/oai"> + Subject and Keywords = Humanities - Languages and literature (6.2) or Humanities - Other humanities (6.5))-/-> + .//dc:subject[contains(.,'(6.2)') or contains(.,'(6.5)')] + DELFT.UU-/-> + --> + + .//*:title[contains(.,'dwangarbeider')] + + diff --git a/src/main/resources/filter.xsl b/src/main/resources/filter.xsl new file mode 100644 index 00000000..9988c47e --- /dev/null +++ b/src/main/resources/filter.xsl @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + DBG: provider_uri[] filter[] + + + + + + + + INF: skipped record[] + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/resources/isebel.xml b/src/main/resources/isebel.xml new file mode 100644 index 00000000..d14c085a --- /dev/null +++ b/src/main/resources/isebel.xml @@ -0,0 +1,52 @@ + + + + + false + + /Users/vic/Documents/DANS/projects/ODISSEI/oai-harvest-manager/data + + 2 + + 10000 + + 4 + + 4 + + 60 + ListRecords + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/scripts/expand-map.sh b/src/main/scripts/expand-map.sh deleted file mode 100755 index 5f6a5d64..00000000 --- a/src/main/scripts/expand-map.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "One argument is needed" - exit 1 -fi - -JAVA="java" -READLINK="readlink" - -function ensureSlash(){ - length=${1}-1 - - # If the parameter passed to the function does not end with a slash, append - # one and return the result - if [ "{$1:length}" != "/" ]; then - echo ${1}/ - fi -} - -export LANG=en_US.UTF-8 - -# Do not assume the script is invoked from the directory it is located in; get -# the directory the script is located in -thisDir="$(dirname "$(${READLINK} -f "$0")")" - -# Get Saxon -if [ ! 
-f ${thisDir}/saxon9he.jar ]; then - wget -O SaxonHE9-8-0-11J.zip https://sourceforge.net/projects/saxon/files/Saxon-HE/9.8/SaxonHE9-8-0-11J.zip/download - unzip SaxonHE9-8-0-11J.zip saxon9he.jar - rm SaxonHE9-8-0-11J.zip - if [ ! -f ${thisDir}/saxon9he.jar ]; then - mv saxon9he.jar ${thisDir}/saxon9he.jar - fi -fi -JAR=${thisDir}/saxon9he.jar - -echo Command: "${JAVA} -jar ${JAR} -xsl:./resources/expandMap.xsl -it:main map=$1" -nice ${JAVA} -jar ${JAR} -xsl:${thisDir}/expandMap.xsl -it:main map=$1 2> ${thisDir}/expand-map.log > ${thisDir}/expand-map.csv -if [ $? -ne 0 ]; then - echo "Failed to expand map" - cat ${thisDir}/expand-map.log - exit 1 -fi -mv $1 $1.bak -mv ${thisDir}/expand-map.csv $1 \ No newline at end of file diff --git a/src/main/scripts/expandMap.xsl b/src/main/scripts/expandMap.xsl deleted file mode 100644 index f0a8e63d..00000000 --- a/src/main/scripts/expandMap.xsl +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - DBG: CSV headers[] - - - - - - ERR: CSV[] line[] has [] cells, but the header indicates that [] cells are expected! - - - - - - - - - - - - ERR: couldn't load CSV[]! - - - - - ERR: no data loaded from CSV[]! 
- - endpointUrl,directoryName,centreName,nationalProject - - - - - - - - - - , - - , - - " - - " - - , - - " - - - " - - - - - - \ No newline at end of file diff --git a/src/main/scripts/oai-harvest.py b/src/main/scripts/oai-harvest.py index 8bb87f99..fc2c985a 100755 --- a/src/main/scripts/oai-harvest.py +++ b/src/main/scripts/oai-harvest.py @@ -15,7 +15,7 @@ def __init__(self, message, args=[]): class OaiHarvest: - def __init__(self, conf=None, oai="/app/oai", base="/app/workdir", output="test", name="test", jvm="-Xmx1G", postgres="oai:oai@localhost:5432/oai", verbose=False): + def __init__(self, conf=None, dry=None, oai="/app/oai", base="/app/workdir", output="test", name="test", jvm="-Xmx1G", postgres="oai:oai@localhost:5432/oai", verbose=False): self.verbose = verbose self.oai = oai @@ -28,6 +28,8 @@ def __init__(self, conf=None, oai="/app/oai", base="/app/workdir", output="test" self.jvm = jvm + self.dry = dry + self.pg = False if postgres: self.pg = True @@ -46,7 +48,6 @@ def __init__(self, conf=None, oai="/app/oai", base="/app/workdir", output="test" self.psql = local["psql"] self.harvester = local[os.path.join(oai, "run-harvester.sh")] - self.mapexpander = local[os.path.join(oai, "expand-map.sh")] self.viewer = local[os.path.join(oai, "run-viewer.sh")] self.workdir = os.path.join(base, "workdir", "%s-%s" % (output, name)) self.logdir = os.path.join(self.workdir, "log") @@ -61,6 +62,7 @@ def __init__(self, conf=None, oai="/app/oai", base="/app/workdir", output="test" if self.verbose: self.print_to_stdout("Config:\n") + self.print_to_stdout("\tdry run: %s\n" % self.dry) self.print_to_stdout("\tconf dir: %s\n" % self.confdir) self.print_to_stdout("\tconf file: %s\n" % self.config_file) self.print_to_stdout("\tlog dir: %s\n" % self.logdir) @@ -83,10 +85,6 @@ def run(self): self.run_harvest() self.print_to_stdout("\tDone\n") - self.print_to_stdout("\tExpand map.\n") - self.expand_map() - self.print_to_stdout("\tDone\n") - self.print_to_stdout("\tReset 
output.\n") self.do_reset() self.print_to_stdout("\tDone\n") @@ -161,6 +159,8 @@ def run_harvest(self): "map-file=%s" % os.path.join(self.workdir, "map.csv"), conf ] + if self.dry == True: + command.insert(0,"dry-run=true") if self.verbose: self.print_to_stdout("\t\tHarvester command:\n") @@ -171,23 +171,6 @@ def run_harvest(self): return self.harvester(command) - def expand_map(self): - """ - Expand the map - """ - command = [ - os.path.join(self.workdir, "map.csv") - ] - - if self.verbose: - self.print_to_stdout("\t\tExpander command:\n") - self.print_to_stdout("\t\t\t%s " % self.mapexpander) - for i in command: - self.print_to_stdout("%s " % i) - self.print_to_stdout("\n") - - return self.mapexpander(command) - def do_reset(self): """ start with a fresh output @@ -352,28 +335,38 @@ class App(cli.Application): VERSION = "0.0.1" verbose = cli.Flag(["v", "verbose"], help="Verbose output") confdir = None + dry = None output = None name = None postgres = None @cli.switch(["-c", "--config"], str, mandatory=False, help="Config directory (can be online). (optional)") def set_config(self, config): - self.confdir = config + if config: + self.confdir = config + + @cli.switch(["-d", "--dry"], str, mandatory=False, help="Dry run. (optional)") + def set_dry(self, dry): + if dry: + self.dry = True @cli.switch(["-o", "--output"], str, mandatory=True, help="Output folder (collection) this harvest is part of.") def set_output(self, output): - self.output = output + if output: + self.output = output @cli.switch(["-n", "--name"], str, mandatory=True, help="Name for this harvest run.") def set_name(self, name): - self.name = name + if name: + self.name = name @cli.switch(["-p", "--postgres"], str, mandatory=False, help="Postgres database (:@:/) to connext to. 
(optional)") def set_postgres(self, postgres): - self.postgres = postgres + if postgres: + self.postgres = postgres def main(self): - oai = OaiHarvest(conf=self.confdir, output=self.output, name=self.name, postgres=self.postgres, verbose=self.verbose) + oai = OaiHarvest(conf=self.confdir, dry=self.dry, output=self.output, name=self.name, postgres=self.postgres, verbose=self.verbose) try: oai.run() except Exception as e: @@ -388,4 +381,4 @@ def main(self): sys.exit(0) if __name__ == "__main__": - App.run() + App.run() \ No newline at end of file diff --git a/src/main/scripts/run-harvester.sh b/src/main/scripts/run-harvester.sh index 51a7ad01..73becb1e 100755 --- a/src/main/scripts/run-harvester.sh +++ b/src/main/scripts/run-harvester.sh @@ -27,6 +27,8 @@ else LOG_DIR=$thisDir fi +CLASSPATH="`find $thisDir -type f -name '*.jar' -exec echo -n "{}:" \;`$CLASSPATH" + PROPS="${PROPS} -Dlogdir=${LOG_DIR} -Dhttp.user=Mozilla/5.0" -nice ${JAVA} ${PROPS} -jar ${JAR} $* +nice ${JAVA} -cp ${CLASSPATH}:${JAR}${PROPS} nl.mpi.oai.harvester.control.Main $* diff --git a/src/test/java/nl/mpi/oai/harvester/control/ConfigurationTest.java b/src/test/java/nl/mpi/oai/harvester/control/ConfigurationTest.java index 7b02f898..182c74ca 100644 --- a/src/test/java/nl/mpi/oai/harvester/control/ConfigurationTest.java +++ b/src/test/java/nl/mpi/oai/harvester/control/ConfigurationTest.java @@ -44,10 +44,15 @@ public class ConfigurationTest { private final static Logger logger = LoggerFactory.getLogger(ConfigurationTest.class); private static final String BASIC_CONFIG_RESOURCE = "/config/test-config-basic.xml"; - final String PROVIDER_INFO_RESOURCE = "/centre-registry-providerinfo.xml"; - final String REGISTRY_OVERVIEW_RESOURCE = "/centre-registry-overview.xml"; - final String MOCK_REGISTRY_REGISTRY_PATH = "/"; - final String MOCK_REGISTRY_CENTRE_INFO_RESOURCE_PATH = "/restxml/1"; + private static final String REGISTRY_PATH = "/model"; + private static final String REGISTRY_CENTRE_INFO = 
REGISTRY_PATH + "/Centre"; + private static final String REGISTRY_CENTRE_RESOURCE = "/centre-registry-Centre.json"; + private static final String REGISTRY_ENDPOINT_INFO = REGISTRY_PATH + "/OAIPMHEndpoint"; + private static final String REGISTRY_ENDPOINT_RESOURCE = "/centre-registry-OAIPMHEndpoint.json"; + private static final String REGISTRY_SET_INFO = REGISTRY_PATH + "/OAIPMHEndpointSet"; + private static final String REGISTRY_SET_RESOURCE = "/centre-registry-OAIPMHEndpointSet.json"; + private static final String REGISTRY_CONSORTIUM_INFO = REGISTRY_PATH + "/Consortium"; + private static final String REGISTRY_CONSORTIUM_RESOURCE = "/centre-registry-Consortium.json"; private static Configuration BASIC_CONFIG; @@ -111,7 +116,7 @@ public void testImportFromRegistry() throws Exception { final List providers = configuration.getProviders(); assertNotNull(providers); - assertEquals(2, providers.size()); + assertEquals(50, providers.size()); //check specific provider { @@ -126,7 +131,7 @@ public void testImportFromRegistry() throws Exception { //second provider { - final Optional prov = providers.stream().filter(p -> p.getOaiUrl().equals("http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai2.pl")).findAny(); + final Optional prov = providers.stream().filter(p -> p.getOaiUrl().equals("http://www.nb.no/clarino/oai")).findAny(); assertTrue(prov.isPresent()); //should have no sets @@ -180,16 +185,22 @@ private File fileForResource(String resource, Function filter) { } private String setUpMockRegistry() throws IOException { - stubFor(get(urlEqualTo(MOCK_REGISTRY_CENTRE_INFO_RESOURCE_PATH)) - .willReturn(aResponse() - .withBody(getResourceAsString(PROVIDER_INFO_RESOURCE)))); - final String centreInfoUrl = "http://localhost:" + wireMockRule.getOptions().portNumber() + MOCK_REGISTRY_CENTRE_INFO_RESOURCE_PATH; - stubFor(get(urlEqualTo(MOCK_REGISTRY_REGISTRY_PATH)) + //set up mock centre registry REST JSON server + stubFor(get(urlEqualTo(REGISTRY_CENTRE_INFO)) + 
.willReturn(aResponse() + .withBody(getResourceAsString(REGISTRY_CENTRE_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_ENDPOINT_INFO)) + .willReturn(aResponse() + .withBody(getResourceAsString(REGISTRY_ENDPOINT_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_SET_INFO)) .willReturn(aResponse() - .withBody(getResourceAsString(REGISTRY_OVERVIEW_RESOURCE) - .replaceAll("\\S+", "" + centreInfoUrl + "")))); - final String registryURl = "http://localhost:" + wireMockRule.getOptions().portNumber() + MOCK_REGISTRY_REGISTRY_PATH; + .withBody(getResourceAsString(REGISTRY_SET_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_CONSORTIUM_INFO)) + .willReturn(aResponse() + .withBody(getResourceAsString(REGISTRY_CONSORTIUM_RESOURCE)))); + + final String registryURl = "http://localhost:" + wireMockRule.getOptions().portNumber() + REGISTRY_PATH; return registryURl; } diff --git a/src/test/java/nl/mpi/oai/harvester/control/RegistryReaderTest.java b/src/test/java/nl/mpi/oai/harvester/control/RegistryReaderTest.java index ca1d0621..4697e822 100644 --- a/src/test/java/nl/mpi/oai/harvester/control/RegistryReaderTest.java +++ b/src/test/java/nl/mpi/oai/harvester/control/RegistryReaderTest.java @@ -45,18 +45,22 @@ * * @author Lari Lampen (MPI-PL) * @author twan@clarin.eu + * @author menzo.windhouwer@di.huc.knaw.nl */ public class RegistryReaderTest { - private static final String PROVIDER_INFO_RESOURCE = "/centre-registry-providerinfo.xml"; - private static final String REGISTRY_OVERVIEW_RESOURCE = "/centre-registry-overview.xml"; - private static final String REGISTRY_PATH = "/"; - private static final String CENTRE_INFO_RESOURCE_PATH = "/restxml/1"; + private static final String REGISTRY_PATH = "/model"; + private static final String REGISTRY_CENTRE_INFO = REGISTRY_PATH + "/Centre"; + private static final String REGISTRY_CENTRE_RESOURCE = "/centre-registry-Centre.json"; + private static final String REGISTRY_ENDPOINT_INFO = REGISTRY_PATH + "/OAIPMHEndpoint"; + private static final String 
REGISTRY_ENDPOINT_RESOURCE = "/centre-registry-OAIPMHEndpoint.json"; + private static final String REGISTRY_SET_INFO = REGISTRY_PATH + "/OAIPMHEndpointSet"; + private static final String REGISTRY_SET_RESOURCE = "/centre-registry-OAIPMHEndpointSet.json"; + private static final String REGISTRY_CONSORTIUM_INFO = REGISTRY_PATH + "/Consortium"; + private static final String REGISTRY_CONSORTIUM_RESOURCE = "/centre-registry-Consortium.json"; private String registryURl; - private String centreInfoUrl; - private DocumentBuilder db; - private RegistryReader instance; + private RegistryReader registry; @ClassRule public static WireMockClassRule wireMockRule = new WireMockClassRule(8089); @@ -66,83 +70,71 @@ public class RegistryReaderTest { @Before public void setUp() throws Exception { - instance = new RegistryReader(); - db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); + registryURl = "http://localhost:" + wireMockRule.getOptions().portNumber() + REGISTRY_PATH; - //set up mock centre registry REST XML server - stubFor(get(urlEqualTo(CENTRE_INFO_RESOURCE_PATH)) - .willReturn(aResponse() - .withBody(getResourceAsString(PROVIDER_INFO_RESOURCE)))); - centreInfoUrl = "http://localhost:" + wireMockRule.getOptions().portNumber() + CENTRE_INFO_RESOURCE_PATH; + registry = new RegistryReader(new URL(registryURl)); - stubFor(get(urlEqualTo(REGISTRY_PATH)) + //set up mock centre registry REST JSON server + stubFor(get(urlEqualTo(REGISTRY_CENTRE_INFO)) .willReturn(aResponse() - .withBody(getResourceAsString(REGISTRY_OVERVIEW_RESOURCE) - .replaceAll("\\S+", "" + centreInfoUrl + "")))); - registryURl = "http://localhost:" + wireMockRule.getOptions().portNumber() + REGISTRY_PATH; - } + .withBody(getResourceAsString(REGISTRY_CENTRE_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_ENDPOINT_INFO)) + .willReturn(aResponse() + .withBody(getResourceAsString(REGISTRY_ENDPOINT_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_SET_INFO)) + .willReturn(aResponse() + 
.withBody(getResourceAsString(REGISTRY_SET_RESOURCE)))); + stubFor(get(urlEqualTo(REGISTRY_CONSORTIUM_INFO)) + .willReturn(aResponse() + .withBody(getResourceAsString(REGISTRY_CONSORTIUM_RESOURCE)))); + } /** - * Test of getProviderInfoUrls method, of class RegistryReader. + * Test of getEndpoints method, of class RegistryReader. */ @Test - public void testGetProviderInfoUrlsFromDoc() throws Exception { - Document docSummary = db.parse(getClass().getResourceAsStream(REGISTRY_OVERVIEW_RESOURCE)); - - List result = instance.getProviderInfoUrls(docSummary); - assertEquals(24, result.size()); + public void testGetEndpoints() throws Exception { + List result = registry.getEndpoints(); + assertEquals(50, result.size()); } /** * Test of getEndpoint method, of class RegistryReader. */ @Test - public void testGetEndpointFromDoc() throws Exception { - String expResult = "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl?verb=Identify"; + public void testGetEndpoint() throws Exception { + String expResult = "http://clarin.dk/oaiprovider/"; - NodeList result = instance.getEndpoints(getProviderInfoDoc()); - assertEquals(expResult, result.item(0).getNodeValue()); - } - - @Test - public void testGetEndpointsFromService() throws Exception { - final List urls = instance.getEndpoints(new URL(registryURl)); - assertEquals(48, urls.size()); // 24 'centres' * 2 endpoints + List result = registry.getEndpoints(); + assertEquals(expResult, result.get(0)); } @Test - public void testGetOaiSetsFromService() throws Exception { - final String endpointUrl1 = "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl?verb=Identify"; - final String endpointUrl2 = "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai2.pl?verb=Identify"; + public void testGetOaiSets() throws Exception { + final String endpointUrl1 = "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl"; + final String endpointUrl2 = 
"http://clarin.dk/oaiprovider/"; - final Map> map = instance.getEndPointOaiPmhSetMap(new URL(registryURl)); - assertEquals(2, map.size()); + final Map> map = registry.getEndPointOaiPmhSetMap(); + assertEquals(50, map.size()); assertTrue(map.containsKey(endpointUrl1)); assertEquals(2, map.get(endpointUrl1).size()); assertTrue(map.containsKey(endpointUrl2)); assertEquals(0, map.get(endpointUrl2).size()); } - @Test - public void testGetOaiPmhSetsFromDoc() throws Exception { - String endpoint = "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl?verb=Identify"; - NodeList result = instance.getOaiPmhSets(getProviderInfoDoc(), endpoint); - assertEquals(2, result.getLength()); - } - @Test public void testGetOaiPmhSetsNone() throws Exception { String endpoint = "http://www.clarin.eu"; - NodeList result = instance.getOaiPmhSets(getProviderInfoDoc(), endpoint); - assertEquals(0, result.getLength()); + final Map> map = registry.getEndPointOaiPmhSetMap(); + assertNull(map.get(endpoint)); } - - private Document getProviderInfoDoc() throws SAXException, IOException { - try (InputStream resource = getClass().getResourceAsStream(PROVIDER_INFO_RESOURCE)) { - return db.parse(resource); - } + + @Test + public void testEndpointMapping() throws Exception { + String entry = registry.endpointMapping("http://clarin.dk/oaiprovider/","CLARIN DK OAI"); + assertEquals("\"http://clarin.dk/oaiprovider/\",\"CLARIN_DK_OAI\",\"The CLARIN, Centre at the \"\"University of Copenhagen\"\"\",\"CLARIN-DK\"", entry); /* JUnit order: expected value first, actual second, so a failure reports them the right way round */ } - + private static String getResourceAsString(String resourceName) throws IOException { final String registryOverviewString; try (InputStream infoResourceStream = RegistryReaderTest.class.getResourceAsStream(resourceName)) { @@ -150,5 +142,4 @@ private static String getResourceAsString(String resourceName) throws IOExceptio } return registryOverviewString; } - } diff --git a/src/test/java/nl/mpi/oai/harvester/harvesting/ListRecordsTestHelper.java
b/src/test/java/nl/mpi/oai/harvester/harvesting/ListRecordsTestHelper.java index def51013..755e7a0f 100644 --- a/src/test/java/nl/mpi/oai/harvester/harvesting/ListRecordsTestHelper.java +++ b/src/test/java/nl/mpi/oai/harvester/harvesting/ListRecordsTestHelper.java @@ -59,8 +59,8 @@ MetadataFormat getMetadataFormats() { void addTraces() { // add the traces identifying the records the test should yield - addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi0571", "Language Bank of Finland-0000000"); - addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi0571", "Language Bank of Finland-0000001"); - addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi2312", "Language Bank of Finland-0000002"); + addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi0571", "ListRecords-0-0000000"); + addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi0571", "ListRecords-0-0000001"); + addToList("http://metalb.csc.fi/cgi-bin/que", "cmdi2312", "ListRecords-0-0000002"); } } diff --git a/src/test/java/nl/mpi/oai/harvester/harvesting/TestHelper.java b/src/test/java/nl/mpi/oai/harvester/harvesting/TestHelper.java index 1f424306..9c68975d 100644 --- a/src/test/java/nl/mpi/oai/harvester/harvesting/TestHelper.java +++ b/src/test/java/nl/mpi/oai/harvester/harvesting/TestHelper.java @@ -218,6 +218,7 @@ Provider getNextEndpoint(){ need to retry. Use zero for the maximum number of retries. 
*/ endpoint = new Provider(endpointURIs[eIndex], 0, new int[]{0}); + endpoint.setName(getTestName()+"-"+eIndex); } catch (ParserConfigurationException e) { endpoint = null; e.printStackTrace(); diff --git a/src/test/resources/centre-registry-Centre.json b/src/test/resources/centre-registry-Centre.json new file mode 100644 index 00000000..c46ad27b --- /dev/null +++ b/src/test/resources/centre-registry-Centre.json @@ -0,0 +1,2078 @@ +[{ + "model": "centre_registry.centre", + "pk": 45, + "fields": { + "name": "Austrian Centre for Digital Humanities and Cultural Heritage - A Resource Centre for the HumanitiEs", + "shorthand": "ACDH-ARCHE", + "organisation_name": "Austrian Centre for Digital Humanities and Cultural Heritage", + "institution": "Austrian Centre for Digital Humanities and Cultural Heritage", + "working_unit": "Austrian Centre for Digital Humanities and Cultural Heritage", + "address": "Sonnenfelsgasse 19", + "postal_code": "1010", + "city": "Vienna", + "latitude": "48.209117", + "longitude": "16.377080", + "type_status": "Certified", + "administrative_contact": 94, + "technical_contact": 95, + "website_url": "https://arche.acdh.oeaw.ac.at/", + "description": "ACDH-CH is a research institute at the Austrian Academy of Sciences offering services for the DH community.", + "expertise": "", + "consortium": 2, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-105", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/03/ARCHE.pdf", + "pid_status": "Handles for metadata and resources (common CLARIN prefix 11022)", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons version 4 + own development", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [96, 2, 3] + } + }, { + "model": "centre_registry.centre", + "pk": 61, + "fields": { + "name": "CLARIN Knowledge Centre for Atypical Communication Expertise", + "shorthand": "ACE", + "organisation_name": "Radboud University", + 
"institution": "Centre for Language and Speech Technology", + "working_unit": "Centre for Language and Speech Technology", + "address": "P.O. Box 9103", + "postal_code": "6500 HD", + "city": "Nijmegen", + "latitude": "51.819146", + "longitude": "5.863923", + "type_status": "Certified", + "administrative_contact": 79, + "technical_contact": 80, + "website_url": "https://ace.ruhosting.nl", + "description": "Atypical communication encompasses language and speech as encountered during (second) language acquisition and development, and in language disorders, but also more broadly in bilingual language development and in sign language.", + "expertise": "specialised in this type of research and concomitant infrastructural issues related to data acquisition, processing and sharing, which is typically highly characterised by sensitivity issues.", + "consortium": 10, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-154", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [18], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 4, + "fields": { + "name": "ASV Leipzig", + "shorthand": "ASV", + "organisation_name": "Abteilung Automatische Sprachverarbeitung", + "institution": "Universit\u00e4t Leipzig", + "working_unit": "Institut f\u00fcr Informatik", + "address": "Augustusplatz 10", + "postal_code": "04109", + "city": "Leipzig", + "latitude": "51.338548", + "longitude": "12.378735", + "type_status": "Certified", + "administrative_contact": 6, + "technical_contact": 7, + "website_url": "http://asv.informatik.uni-leipzig.de/", + "description": "", + "expertise": "monolingual corpora, text mining", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-88", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/02/CLARIN-D-Resource-Center-Leipzig.pdf", + "pid_status": "Handle via EPIC.", 
+ "long_term_archiving_policy": "Providing long-term archiving services.", + "repository_system": "Fedora Commons", + "strict_versioning": false, + "type": [1], + "assessmentdates": [12], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 5, + "fields": { + "name": "Bayerisches Archiv f\u00fcr Sprachsignale", + "shorthand": "BAS", + "organisation_name": "Bayerisches Archiv f\u00fcr Sprachsignale", + "institution": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", + "working_unit": "BAS CLARIN-D Centre", + "address": "Schellingstra\u00dfe 3", + "postal_code": "80799", + "city": "M\u00fcnchen", + "latitude": "48.149216", + "longitude": "11.579364", + "type_status": "Certified", + "administrative_contact": 8, + "technical_contact": 9, + "website_url": "http://hdl.handle.net/11858/00-1779-0000-000C-DAAF-B", + "description": "", + "expertise": "Speech data processing, web services and tools for spoken language, corpus production and validation, curation of external speech corpora, consulting and teaching.", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-89", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/05/BAS-CLARIN.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "Custom.", + "strict_versioning": true, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 6, + "fields": { + "name": "Berlin-Brandenburg Academy of Sciences and Humanities", + "shorthand": "BBAW", + "organisation_name": "Berlin-Brandenburg Academy of Sciences and Humanities", + "institution": "Union of the German academies of sciences and humanities", + "working_unit": "Deutsches Textarchiv", + "address": "J\u00e4gerstra\u00dfe 23", + "postal_code": "10117", + "city": "Berlin", + "latitude": "52.514156", + "longitude": "13.394324", + "type_status": "Certified", + "administrative_contact": 10, + 
"technical_contact": 11, + "website_url": "http://www.bbaw.de/", + "description": "", + "expertise": "TEI encoding", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-93", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/10/CLARIN-Center-BBAW.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons", + "strict_versioning": true, + "type": [1], + "assessmentdates": [1], + "monitoring_contacts": [11] + } + }, { + "model": "centre_registry.centre", + "pk": 35, + "fields": { + "name": "Centre for the Digital Foundation of Research in the Humanities, Social, and Educational Sciences", + "shorthand": "CEDIFOR", + "organisation_name": " Faculty of Computer Science and Mathematics", + "institution": " Goethe University Frankfurt & TU Darmstadt & DIPF (Frankfurt)", + "working_unit": " Texttechnology Lab (TTLab)", + "address": "Senckenberganlage 31", + "postal_code": "60325", + "city": "Frankfurt, Germany", + "latitude": "50.119051", + "longitude": "8.652244", + "type_status": "Aiming for B", + "administrative_contact": 68, + "technical_contact": 69, + "website_url": "https://www.cedifor.de", + "description": "CEDIFOR is a Digital Humanities Centre funded by the BMBF and located at GU Frankfurt, TU Darmstadt, and DIPF (Frankfurt)", + "expertise": "Advising researchers from the Humanities, Social, and Educational Sciences on adopting computer based methods in their research", + "consortium": 1, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": " PIDs based on ePIC (handles)", + "long_term_archiving_policy": "", + "repository_system": " self-developed", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 15, + "fields": { + "name": "Center of Estonian Language Resources", + "shorthand": "CELR-EKK", + "organisation_name": "Eesti Keeleressursside Keskus", + 
"institution": "University of Tartu", + "working_unit": "Eesti Keeleressursside Keskus", + "address": "Narva mnt 18 - 3060", + "postal_code": "51009", + "city": "Tartu", + "latitude": "58.378389", + "longitude": "26.714716", + "type_status": "Certified", + "administrative_contact": 26, + "technical_contact": 27, + "website_url": "http://www.keeleressursid.ee/", + "description": "", + "expertise": "corpora, lexica, tools, Estonian", + "consortium": 5, + "type_certificate_url": "", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/11/CELR-META-SHARE.pdf", + "pid_status": "Handle (own server and prefix: 11297).", + "long_term_archiving_policy": "", + "repository_system": "META-SHARE", + "strict_versioning": false, + "type": [1], + "assessmentdates": [2], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 46, + "fields": { + "name": "CLARIN Knowledge Centre for linguistic diversity and language documentation", + "shorthand": "CKLD", + "organisation_name": "University Hamburg ; SOAS London ; University of Cologne", + "institution": "University Hamburg ; SOAS London ; University of Cologne", + "working_unit": "Department of Linguistics", + "address": "University of Cologne", + "postal_code": "50923", + "city": "Cologne", + "latitude": "50.9266658", + "longitude": "6.9304855", + "type_status": "Certified (K)", + "administrative_contact": 98, + "technical_contact": 98, + "website_url": "http://ckld.uni-koeln.de/", + "description": "CKLD is a distributed K-Centre and offers expertise on linguistic diversity and language documentation.", + "expertise": "linguistic diversity and language documentation", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-142", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": 
"centre_registry.centre", + "pk": 14, + "fields": { + "name": "The CLARIN, Centre at the \"University of Copenhagen\"", + "shorthand": "CLARIN-DK-UCPH", + "organisation_name": "University of Copenhagen", + "institution": "Faculty of Humanities", + "working_unit": "Centre for Language Technology, Department of Nordic Studies and Linguistics", + "address": "Emil Holms Kanal 2", + "postal_code": "2300", + "city": "Copenhagen S", + "latitude": "55.6618877", + "longitude": "12.5912526", + "type_status": "Certified", + "administrative_contact": 106, + "technical_contact": 25, + "website_url": "http://clarin.dk/", + "description": "", + "expertise": "multimodal communication, corpus linguistics, text tools", + "consortium": 4, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-100", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/07/CLARIN-Centre-at-the-University-of-Copenhagen.pdf", + "pid_status": "Handle (own server and prefixes: 20.500.12115 and 11221).", + "long_term_archiving_policy": "", + "repository_system": "DSpace and eSciDoc", + "strict_versioning": false, + "type": [1, 5], + "assessmentdates": [17], + "monitoring_contacts": [107] + } + }, { + "model": "centre_registry.centre", + "pk": 64, + "fields": { + "name": "CLARIN-IS", + "shorthand": "CLARIN-IS", + "organisation_name": "The \u00c1rni Magn\u00fasson Institute for Icelandic Studies", + "institution": "The \u00c1rni Magn\u00fasson Institute for Icelandic Studies", + "working_unit": "The \u00c1rni Magn\u00fasson Institute for Icelandic Studies", + "address": "Laugavegur 13", + "postal_code": "IS-101", + "city": "Reykjav\u00edk", + "latitude": "64.1464", + "longitude": "-21.9422", + "type_status": "Aiming for B", + "administrative_contact": 127, + "technical_contact": 128, + "website_url": "http://clarin.is", + "description": "", + "expertise": "", + "consortium": 26, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + 
"repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [127] + } + }, { + "model": "centre_registry.centre", + "pk": 36, + "fields": { + "name": "CLARIN-LT", + "shorthand": "CLARIN-LT", + "organisation_name": "CLARIN-LT", + "institution": "Vytautas Magnus university", + "working_unit": "Centre of Computational Linguistics", + "address": "Putvinskio 23-216", + "postal_code": "44212", + "city": "Kaunas", + "latitude": "54.9005572", + "longitude": "23.914398", + "type_status": "Aiming for B", + "administrative_contact": 70, + "technical_contact": 71, + "website_url": "http://www.clarin-lt.lt", + "description": "", + "expertise": "The centre will provide tools, resources and services for the Lithuanian language.", + "consortium": 15, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": " Dspace", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 68, + "fields": { + "name": "CLARIN Centre of Latvian language resources and tools", + "shorthand": "CLARIN-LV", + "organisation_name": "Institute of Mathematics and Computer Science, University of Latvia", + "institution": "Institute of Mathematics and Computer Science, University of Latvia", + "working_unit": "Artificial Intelligence Laboratory", + "address": "Rai\u0146a bulv\u0101ris 29", + "postal_code": "LV-1050", + "city": "Riga", + "latitude": "56.94852", + "longitude": "24.11803", + "type_status": "", + "administrative_contact": 138, + "technical_contact": 139, + "website_url": "https://www.clarin.lv", + "description": "", + "expertise": "", + "consortium": 24, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": 
[] + } + }, { + "model": "centre_registry.centre", + "pk": 25, + "fields": { + "name": "CLARIN-PL Language Technology Centre", + "shorthand": "CLARIN-PL1", + "organisation_name": "Institute of Informatics", + "institution": "Wroclaw University of Technology", + "working_unit": "CLARIN-PL", + "address": "wybrze\u017ce Stanis\u0142awa Wyspia\u0144skiego 27", + "postal_code": "50-370", + "city": "Wroc\u0142aw", + "latitude": "51.107122", + "longitude": "17.063023", + "type_status": "Certified", + "administrative_contact": 146, + "technical_contact": 41, + "website_url": "http://nlp.pwr.wroc.pl/en/projects/clarin", + "description": "Main Centre in CLARIN-PL, providing infrastructural services and access to CLARIN-PL language resources.", + "expertise": "various domains of language technology, language engineering, computational linguistics, lexicography", + "consortium": 11, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-112", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/12/CLARIN-PL-Language-Technology-Centre.pdf", + "pid_status": "Planned, Handle via EPIC.", + "long_term_archiving_policy": "Handle (own server and prefix: 11321).", + "repository_system": "DSpace, LINDAT/CLARIN extensions", + "strict_versioning": false, + "type": [1, 5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 53, + "fields": { + "name": "CLARIN Knowledge Centre for Swedish in a Multilingual Setting (SMS)", + "shorthand": "CLARIN-SMS", + "organisation_name": "CLARIN Knowledge Centre for Swedish in a Multilingual Setting", + "institution": "Stockholm University", + "working_unit": "Department of Linguistics", + "address": "Universitetsv\u00e4gen 10 C", + "postal_code": "SE - 10691", + "city": "Stockholm", + "latitude": "59.334591", + "longitude": "18.063240", + "type_status": "Certified", + "administrative_contact": 113, + "technical_contact": 113, + "website_url": 
"https://sweclarin.se/eng/centers/sms", + "description": "", + "expertise": "Offers special expertise in the areas of processing of parallel and comparable corpora, including alignment and machine translation", + "consortium": 20, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-151", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [8], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 57, + "fields": { + "name": "CLARIN Knowledge Centre for Speech Analysis", + "shorthand": "CLARIN-SPEECH", + "organisation_name": "KTH School of Computer Science and Communication", + "institution": "KTH Royal Institute of Technology", + "working_unit": "Division of Speech, Music and Hearing", + "address": "Lindstedtsv\u00e4gen 24", + "postal_code": "SE-100 44", + "city": "Stockholm", + "latitude": "59.334591", + "longitude": "18.063240", + "type_status": "Certified", + "administrative_contact": 111, + "technical_contact": 111, + "website_url": "https://www.kth.se/tmh", + "description": "Technical advice on speech analysis", + "expertise": "Expertise in the collection, processing, annotation and exploitation of large multimodal speech corpora", + "consortium": 20, + "type_certificate_url": "http://hdl.handle.net/1839/00-DOCS.CLARIN.EU-117", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [5], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 29, + "fields": { + "name": "CLARINO Bergen Center", + "shorthand": "CLARINO Bergen", + "organisation_name": "University of Bergen", + "institution": "University of Bergen", + "working_unit": "University of Bergen Library and Dept. LLE", + "address": "UiB Dept. 
LLE, Postboks 7800", + "postal_code": "5020", + "city": "Bergen", + "latitude": "60.2786821", + "longitude": "6.1084835", + "type_status": "B, K (certified)", + "administrative_contact": 50, + "technical_contact": 50, + "website_url": "http://clarino.uib.no/", + "description": "CLARINO Bergen Centre offers a repository, a corpus management and query system, a treebanking infrastructure and a CMDI editor.", + "expertise": "Language resources; corpus services, treebanking services, metadata services.", + "consortium": 14, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-116", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/11/CLARINO-Bergen-Centre.pdf", + "pid_status": "Handle (own prefix)", + "long_term_archiving_policy": "", + "repository_system": "DSpace (LINDAT mod)", + "strict_versioning": true, + "type": [1, 5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 30, + "fields": { + "name": "CLARIN.SI Language Technology Centre", + "shorthand": "CLARINSI", + "organisation_name": "The Slovenian CLARIN.SI research infrastructure", + "institution": "Jo\u017eef Stefan Institute", + "working_unit": "Dept. 
of Knowledge Technologies, Artificial Intelligence Lab., Networking Infrastructure Centre", + "address": "Jamova cesta 39", + "postal_code": "SI \u2013 1000", + "city": "Ljubljana", + "latitude": "46.042068", + "longitude": "14.487589", + "type_status": "Certified", + "administrative_contact": 53, + "technical_contact": 52, + "website_url": "http://www.clarin.si/info/about/", + "description": "", + "expertise": "", + "consortium": 18, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-115", + "dsa_url": "https://assessment.datasealofapproval.org/assessment_190/seal/html/", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "DSpace.", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 59, + "fields": { + "name": "CLARIN Knowledge Centre for South Slavic Languages", + "shorthand": "CLASSLA", + "organisation_name": "The Slovenian CLARIN.SI research infrastructure", + "institution": "Jo\u017eef Stefan Institute", + "working_unit": "Jo\u017eef Stefan Institute", + "address": "Jamova cesta 39", + "postal_code": "SI \u2013 1000", + "city": "Ljubljana", + "latitude": "46.042068", + "longitude": "14.487589", + "type_status": "Certified", + "administrative_contact": 120, + "technical_contact": 120, + "website_url": "http://www.clarin.si/info/k-centre/", + "description": "The centre offers expertise on language resources and technologies for South Slavic languages", + "expertise": "", + "consortium": 18, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-153", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [14], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 40, + "fields": { + "name": "Centre for Language and Speech Technology", + "shorthand": "CLST", + "organisation_name": 
"Radboud University", + "institution": "Centre for Language and Speech Technology", + "working_unit": "Centre for Language and Speech Technology", + "address": "P.O. Box 9103", + "postal_code": "6500 HD", + "city": "Nijmegen", + "latitude": "51.819146", + "longitude": "5.863923", + "type_status": "", + "administrative_contact": 79, + "technical_contact": 80, + "website_url": "http://www.ru.nl/clst", + "description": "CLST was founded in January 2003. Its objective is to contribute to the development of language and speech technology.", + "expertise": "Language and speech technology in the contexts of data mining, language learning and speech therapy", + "consortium": 10, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "Our data are presently offered via MPI (https://corpus1.mpi.nl/ds/asv/?0&openpath=node:2102153)", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 18, + "fields": { + "name": "CMU-TalkBank", + "shorthand": "CMU", + "organisation_name": "TalkBank", + "institution": "Carnegie Mellon University", + "working_unit": "Psychology and Language Technologies Institute", + "address": "5000 Forbes Avenue", + "postal_code": "PA 15213", + "city": "Pittsburgh", + "latitude": "40.440988", + "longitude": "-79.94231", + "type_status": "Certified", + "administrative_contact": 32, + "technical_contact": 32, + "website_url": "https://talkbank.org", + "description": "TalkBank data and tools.", + "expertise": "", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-109", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2017/10/TalkBank.pdf", + "pid_status": "Handle (own server and prefix: 11312).", + "long_term_archiving_policy": "", + "repository_system": "GIT", + "strict_versioning": false, + "type": [1, 5], + "assessmentdates": [], + 
"monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 51, + "fields": { + "name": "Czech National Corpus", + "shorthand": "CNC", + "organisation_name": "Charles University", + "institution": "Faculty of Arts", + "working_unit": "Institute of the Czech National Corpus", + "address": "N\u00e1m. Jana Palacha 2", + "postal_code": "116 38", + "city": "Praha", + "latitude": "50.0889171", + "longitude": "14.4160353", + "type_status": "", + "administrative_contact": 110, + "technical_contact": 110, + "website_url": "http://www.korpus.cz", + "description": "CNC is a research infrastructure (RI) recognized by the Czech government and included on the national RI roadmap.", + "expertise": "Continuous mapping of Czech by building and annotating large general-purpose language corpora and providing access to them.", + "consortium": 3, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [3], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 33, + "fields": { + "name": "Collections de corpus oraux numeriques ", + "shorthand": "COCOON", + "organisation_name": "CNRS/COCOON ", + "institution": "Centre National de la Recherche Scientifique ", + "working_unit": " Laboration lig\u00e9rien de linguistique (LLL) & Langues et civilisations \u00e0 tradition orale (LACITO) ", + "address": " Biblioth\u00e8que nationale de France, Quai Fran\u00e7ois Mauriac, 75013 Paris ", + "postal_code": "75013", + "city": "Paris", + "latitude": "48.8335842", + "longitude": "2.3735772", + "type_status": "", + "administrative_contact": 60, + "technical_contact": 61, + "website_url": "http://cocoon.huma-num.fr", + "description": "Speech recordings repository", + "expertise": "", + "consortium": 23, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "ARK and PURL", + "long_term_archiving_policy":
"", + "repository_system": " self-developed ", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 20, + "fields": { + "name": "Data Archiving and Networked Services", + "shorthand": "DANS", + "organisation_name": "Data Archiving and Networked Services", + "institution": "Royal Netherlands Academy of Arts and Sciences", + "working_unit": "DANS", + "address": "Anna van Saksenlaan 51", + "postal_code": "2593 HW", + "city": "Den Haag", + "latitude": "52.080776", + "longitude": "4.345627", + "type_status": "Aiming for B.", + "administrative_contact": 34, + "technical_contact": 34, + "website_url": "http://dans.knaw.nl/", + "description": "", + "expertise": "Sustainable access to digital research data, training and advice, and performs research into sustained access to digital information.", + "consortium": 10, + "type_certificate_url": "", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/04/DANS-Electronic-Archiving-SYstem-EASY-.pdf", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "EASY, based on Fedora Commons.", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 62, + "fields": { + "name": "DARIAH-DE Repository", + "shorthand": "DH-REP", + "organisation_name": "DARIAH-DE", + "institution": "Nieders\u00e4chsische Staats- und Universit\u00e4tsbibliothek G\u00f6ttingen", + "working_unit": "Research and Development Department", + "address": "Papendiek 14", + "postal_code": "37073", + "city": "G\u00f6ttingen", + "latitude": "51.562323", + "longitude": "9.970938", + "type_status": "", + "administrative_contact": 125, + "technical_contact": 124, + "website_url": "https://repository.de.dariah.eu/doc/services/index.html", + "description": "", + "expertise": "", + "consortium": null, + "type_certificate_url": "", + "dsa_url": 
"", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 54, + "fields": { + "name": "Clarin Knowledge Centre on Diachronic Language Resources", + "shorthand": "DiaRes", + "organisation_name": "Uppsala University", + "institution": "Uppsala University", + "working_unit": "Department of Linguistics and Philology", + "address": "Box 635", + "postal_code": "SE-75126", + "city": "Uppsala", + "latitude": "59.511619", + "longitude": "17.372639", + "type_status": "Certified", + "administrative_contact": 114, + "technical_contact": 114, + "website_url": "https://sweclarin.se/eng/centers/diares", + "description": "", + "expertise": "Focus on diachrionic text collections, historical texts, and tools and resources for processing and analysing them.", + "consortium": 20, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-149", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [8], + "monitoring_contacts": [114] + } + }, { + "model": "centre_registry.centre", + "pk": 1, + "fields": { + "name": "Eberhard Karls Universit\u00e4t T\u00fcbingen", + "shorthand": "EKUT", + "organisation_name": "Eberhard Karls Universit\u00e4t T\u00fcbingen", + "institution": "Seminar f\u00fcr Sprachwissenschaft", + "working_unit": "Allgemeine Sprachwissenschaft und Computerlinguistik", + "address": "Wilhelmstra\u00dfe 19-23", + "postal_code": "72074", + "city": "T\u00fcbingen", + "latitude": "48.527037", + "longitude": "9.062002", + "type_status": "Certified", + "administrative_contact": 1, + "technical_contact": 1, + "website_url": "http://www.sfs.uni-tuebingen.de/", + "description": "", + "expertise": "corpus linguistics", + "consortium": 1, + "type_certificate_url": 
"http://hdl.handle.net/11372/DOC-87", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/03/T\u00fcbingen-CLARIN-D-Repository.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "None.", + "repository_system": "Fedora Commons", + "strict_versioning": true, + "type": [1], + "assessmentdates": [16], + "monitoring_contacts": [56, 55] + } + }, { + "model": "centre_registry.centre", + "pk": 49, + "fields": { + "name": "Eurac Research CLARIN Centre", + "shorthand": "ERCC", + "organisation_name": "Eurac Research", + "institution": "Institute for Applied Linguistics", + "working_unit": "Institute for Applied Linguistics", + "address": "Viale Druso, 1 / Drususallee 1", + "postal_code": "39100", + "city": "Bolzano / Bozen", + "latitude": "46.4892648", + "longitude": "11.3296052", + "type_status": "Aiming for B", + "administrative_contact": 102, + "technical_contact": 103, + "website_url": "https://clarin.eurac.edu/", + "description": "Storing and providing data collected at the Institute for Applied Linguistics at Eurac Research and beyond.", + "expertise": "Specialised in CMC and Language Learner Data", + "consortium": 21, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "Handles (prefix 20.500.12124)", + "long_term_archiving_policy": "", + "repository_system": "CLARIN DSpace", + "strict_versioning": true, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [103] + } + }, { + "model": "centre_registry.centre", + "pk": 17, + "fields": { + "name": "The Language Bank of Finland", + "shorthand": "FIN-CLARIN", + "organisation_name": "The Department of Digital Humanities", + "institution": "University of Helsinki (UHEL) ", + "working_unit": "FIN-CLARIN", + "address": "Unioninkatu 40", + "postal_code": "00170", + "city": "Helsinki", + "latitude": "60.172839", + "longitude": "24.950366", + "type_status": "Certified", + "administrative_contact": 29, + "technical_contact": 105, + "website_url": 
"http://www.kielipankki.fi/", + "description": "Offers content for download as well as web services", + "expertise": "digital humanities, computational linguistics, corpora", + "consortium": 7, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-135", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/04/The-Language-Bank-of-Finland.pdf", + "pid_status": "URN:NBN and Handle (prefix: 11113).", + "long_term_archiving_policy": "", + "repository_system": "LAT", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 7, + "fields": { + "name": "Forschungzentrum J\u00fclich", + "shorthand": "FZJ", + "organisation_name": "http://www.fz-juelich.de/ias/jsc/EN/Home/home_node.html", + "institution": "Institute for Advanced Simulation", + "working_unit": "J\u00fclich Supercomputing Centre", + "address": "IEK-7, Wilhelm-Johnen-Stra\u00dfe", + "postal_code": "52428", + "city": "J\u00fclich", + "latitude": "50.905369", + "longitude": "6.404938", + "type_status": "", + "administrative_contact": 12, + "technical_contact": 12, + "website_url": "http://www.fz-juelich.de/ias/jsc/EN/Home/home_node.html", + "description": "", + "expertise": "web service monitoring", + "consortium": 1, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [2], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 60, + "fields": { + "name": "Georg Eckert Institute for International Textbook Research", + "shorthand": "GEI", + "organisation_name": "Georg Eckert Institute for International Textbook Research", + "institution": "Georg Eckert Institute for International Textbook Research", + "working_unit": "Digital Information and Research Infrastructure Department (DIRI)", + "address": "Celler Stra\u00dfe 3", + 
"postal_code": "38114", + "city": "Braunschweig", + "latitude": "52.268450", + "longitude": "10.511086", + "type_status": "", + "administrative_contact": 123, + "technical_contact": 123, + "website_url": "http://worldviews.gei.de", + "description": "", + "expertise": "", + "consortium": null, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [123] + } + }, { + "model": "centre_registry.centre", + "pk": 8, + "fields": { + "name": "Gesellschaft f\u00fcr wissenschaftliche Datenverarbeitung G\u00f6ttingen", + "shorthand": "GWDG", + "organisation_name": "Gesellschaft f\u00fcr wissenschaftliche Datenverarbeitung G\u00f6ttingen", + "institution": "Gesellschaft f\u00fcr wissenschaftliche Datenverarbeitung G\u00f6ttingen", + "working_unit": "Gesellschaft f\u00fcr wissenschaftliche Datenverarbeitung G\u00f6ttingen", + "address": "Am Fa\u00dfberg 11", + "postal_code": "37077", + "city": "G\u00f6ttingen", + "latitude": "51.562323", + "longitude": "9.970938", + "type_status": "", + "administrative_contact": 13, + "technical_contact": 13, + "website_url": "http://www.gwdg.de/", + "description": "", + "expertise": "PIDs/EPIC, hosting, data storage", + "consortium": 1, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [2], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 9, + "fields": { + "name": "Hamburger Zentrum f\u00fcr Sprachkorpora", + "shorthand": "HZSK", + "organisation_name": "Hamburger Zentrum f\u00fcr Sprachkorpora", + "institution": "Universit\u00e4t Hamburg", + "working_unit": "HZSK CLARIN D-Centre", + "address": "Max-Brauer-Allee 60", + "postal_code": "22765", + "city": "Hamburg", + "latitude": "53.553724", + 
"longitude": "9.939426", + "type_status": "Certified", + "administrative_contact": 14, + "technical_contact": 15, + "website_url": "http://www.corpora.uni-hamburg.de/", + "description": "", + "expertise": "linguistic tools, multimedia and multimodal data, spoken language corpora", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-94", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/02/HZSK-Repository.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons", + "strict_versioning": false, + "type": [1], + "assessmentdates": [13], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 21, + "fields": { + "name": "Huygens ING", + "shorthand": "Huygens", + "organisation_name": "Huygens Institute for the History of the Netherlands", + "institution": "Royal Dutch Academy of Sciences", + "working_unit": "Huygens ING", + "address": "Prins Willem-Alexanderhof 5", + "postal_code": "2595 BE", + "city": "Den Haag", + "latitude": "52.081972", + "longitude": "4.327001", + "type_status": "", + "administrative_contact": 63, + "technical_contact": 63, + "website_url": "http://www.huygens.knaw.nl/", + "description": "", + "expertise": "political and institutional history, literature and letters, history of science", + "consortium": 10, + "type_certificate_url": "", + "dsa_url": "https://assessment.datasealofapproval.org/assessment_124/seal/html/", + "pid_status": "Handle (own server and prefix: 11240).", + "long_term_archiving_policy": "", + "repository_system": "Custom.", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 11, + "fields": { + "name": "Leibniz-Institut f\u00fcr Deutsche Sprache", + "shorthand": "IDS", + "organisation_name": "Leibniz-Institut f\u00fcr Deutsche Sprache", + "institution": "Leibniz Gemeinschaft", + "working_unit": 
"Leibniz-Institut f\u00fcr Deutsche Sprache", + "address": "R5 6-13", + "postal_code": "68161", + "city": "Mannheim", + "latitude": "49.488122", + "longitude": "8.472311", + "type_status": "Certified", + "administrative_contact": 18, + "technical_contact": 19, + "website_url": "http://www.ids-mannheim.de/", + "description": "Providing long-term storage of Germanic language resources.", + "expertise": "language archives, linguistic tools, long-term preservation, multimedia and multimodal data", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-90", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/03/IDS-Repository.pdf", + "pid_status": "Handle (own server and prefix: 10932).", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons", + "strict_versioning": true, + "type": [1], + "assessmentdates": [15], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 34, + "fields": { + "name": "The ILC4CLARIN Centre at the Institute for Computational Linguistics", + "shorthand": "ILC4CLARIN", + "organisation_name": "Institute for Computational Linguistics \"Antonio Zampolli\"", + "institution": "National Research Council", + "working_unit": "Institute for Computational Linguistics \"Antonio Zampolli\"", + "address": "Via Moruzzi, 1", + "postal_code": "56124", + "city": "Pisa", + "latitude": "43.7185511", + "longitude": "10.4222234", + "type_status": "Certified", + "administrative_contact": 64, + "technical_contact": 65, + "website_url": "https://ilc4clarin.ilc.cnr.it/", + "description": "", + "expertise": "computational linguistics, language resources, natural language processing, linguistic annotation, information extracion", + "consortium": 21, + "type_certificate_url": "", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/04/The-ILC4CLARIN-Centre-at-the-Institute-for-Computational-Linguistics-.pdf", + "pid_status": "handle", + "long_term_archiving_policy": "", 
+ "repository_system": "Dspace", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 52, + "fields": { + "name": "IMPACT Centre of Competence", + "shorthand": "IMPACT-CKC", + "organisation_name": "IMPACT Centre of Competence", + "institution": "Fundaci\u00f3n Biblioteca Virtual Miguel de Cervantes", + "working_unit": "Fundaci\u00f3n Biblioteca Virtual Miguel de Cervantes", + "address": "Paseo de la Castellana 103", + "postal_code": "28046", + "city": "Madrid", + "latitude": "40.42998", + "longitude": "3.68914", + "type_status": "Certified", + "administrative_contact": 112, + "technical_contact": 112, + "website_url": "http://www.digitisation.eu", + "description": "Digitisation of historical text including the whole digitisation workflow: image pre-processing, OCR, post-correction, etc.", + "expertise": "", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-148", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [3], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 10, + "fields": { + "name": "Institut f\u00fcr Maschinelle Sprachverarbeitung", + "shorthand": "IMS", + "organisation_name": "Institut f\u00fcr Maschinelle Sprachverarbeitung", + "institution": "Universit\u00e4t Stuttgart", + "working_unit": "Lehrstuhl Grundlagen der Computerlinguistik", + "address": "Pfaffenwaldring 5b", + "postal_code": "70569", + "city": "Stuttgart", + "latitude": "48.746750", + "longitude": "9.108240", + "type_status": "Certified", + "administrative_contact": 16, + "technical_contact": 81, + "website_url": "http://www.ims.uni-stuttgart.de/", + "description": "", + "expertise": "linguistically annotated corpora, corpus tools, tools and methods for robust morphological and syntactic analysis of multiple 
languages.", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-95", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/12/IMS-Repository.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons", + "strict_versioning": false, + "type": [1], + "assessmentdates": [4], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 22, + "fields": { + "name": "Instituut voor de Nederlandse Taal", + "shorthand": "IVDNT", + "organisation_name": "Instituut voor de Nederlandse Taal", + "institution": "Instituut voor de Nederlandse Taal", + "working_unit": "Instituut voor de Nederlandse Taal", + "address": "PO Box 9515", + "postal_code": "2300 RA", + "city": "Leiden", + "latitude": "52.1578635", + "longitude": "4.4851714", + "type_status": "Certified", + "administrative_contact": 93, + "technical_contact": 37, + "website_url": "http://ivdnt.org/", + "description": "", + "expertise": "knowledge bank of Dutch vocabulary, the Dutch Language Database, large Dutch corpora, all areas of Dutch linguistics, corpus linguistics, language resource standards, Toolbox for Lexicon Building", + "consortium": 10, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-107", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/12/CLARIN-Center-IvdNT.pdf", + "pid_status": "Handle (own server and prefix: 10032).", + "long_term_archiving_policy": "", + "repository_system": "DSpace", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 67, + "fields": { + "name": "CLARIN Knowledge Centre for Belarusian text and speech processing", + "shorthand": "K-BLP", + "organisation_name": "United Institute of Informatics Problems", + "institution": "National Academy of Sciences of Belarus", + "working_unit": "Speech Synthesis and Recognition Laboratory", + 
"address": "\u0432\u0443\u043b\u0456\u0446\u0430 \u0421\u0443\u0440\u0433\u0430\u043d\u0430\u0432\u0430 6", + "postal_code": "220012", + "city": "Minsk", + "latitude": "53.9200628", + "longitude": "27.6033", + "type_status": "Certified K", + "administrative_contact": 135, + "technical_contact": 137, + "website_url": "https://clarin-belarus.corpus.by", + "description": "", + "expertise": "", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-159", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3, 5], + "assessmentdates": [22], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 47, + "fields": { + "name": "Language Archive Cologne", + "shorthand": "LAC", + "organisation_name": "Data Centre for the Humanities", + "institution": "University of Cologne", + "working_unit": "Faculty of Arts and Humanities", + "address": "Data Center for the Humanities / Universit\u00e4t zu K\u00f6ln / Albertus-Magnus-Platz", + "postal_code": "D-50923 ", + "city": "Cologne", + "latitude": "50.928130", + "longitude": "6.928546", + "type_status": "Aiming for B", + "administrative_contact": 99, + "technical_contact": 99, + "website_url": "https://lac.uni-koeln.de", + "description": "The Language Archive Cologne (LAC) supports research, learning and teaching with high quality and dependable digital language repository", + "expertise": "audio-visual speech data, linguistic fieldwork, linguistic diversity, and language documentation", + "consortium": 1, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "Handle / 11341", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": true, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [99] + } + }, { + "model": "centre_registry.centre", + "pk": 3, + "fields": { + "name": "LINDAT/CLARIN", + "shorthand": "LINDAT", + "organisation_name": 
"\u00daFAL MFF UK - Linguistics", + "institution": "Univerzita Karlova v Praze", + "working_unit": "LINDAT/CLARIN", + "address": "Malostransk\u00e9 n\u00e1m\u011bst\u00ed 25", + "postal_code": "118 00", + "city": "Praha", + "latitude": "50.088521", + "longitude": "14.403265", + "type_status": "Certified", + "administrative_contact": 4, + "technical_contact": 5, + "website_url": "http://lindat.mff.cuni.cz/", + "description": "", + "expertise": "computational linguistics, language resources, natural language processing, linguistic annotation, machine translation, machine learning, information retrieval, tagging/parsing, semantics", + "consortium": 3, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-99", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/08/LINDAT-CLARIN.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "DSpace", + "strict_versioning": false, + "type": [1], + "assessmentdates": [17], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 23, + "fields": { + "name": "Meertens Instituut/HuC", + "shorthand": "MI", + "organisation_name": "Meertens Instituut/HuC", + "institution": "Royal Netherlands Academy of Arts and Sciences", + "working_unit": "Meertens Instituut/HuC", + "address": "Oudezijds Achterburgwal 185", + "postal_code": "1012 DK", + "city": "Amsterdam", + "latitude": "52.370616", + "longitude": "4.896523", + "type_status": "Certified", + "administrative_contact": 44, + "technical_contact": 131, + "website_url": "http://www.meertens.knaw.nl/", + "description": "", + "expertise": "language variation and culture", + "consortium": 10, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-106", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2018/03/Meertens-Institute.pdf", + "pid_status": "Handle via EPIC (own prefix: 10744).", + "long_term_archiving_policy": "Providing long-term archiving services.", + 
"repository_system": "Custom.", + "strict_versioning": false, + "type": [1], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 44, + "fields": { + "name": "Mediterranean Research Centre for the Humanities' Phonoth\u00e8que", + "shorthand": "MMSH's Phonoth\u00e8que", + "organisation_name": "Ma", + "institution": "Maison M\u00e9diterran\u00e9enne des Sciences de l\u2019Homme - Aix-Marseille University / CNRS", + "working_unit": "N/A", + "address": "5 rue du ch\u00e2teau de l'horloge", + "postal_code": "BP 647 13094 Cedex 2", + "city": "Aix-en-Provence", + "latitude": "43.535070", + "longitude": "5.446179", + "type_status": "", + "administrative_contact": 91, + "technical_contact": 91, + "website_url": "http://phonotheque.mmsh.huma-num.fr", + "description": "The collection held more than 8000 hours of audio archives recorded since the late 1950s concerning all the humanities sciences.", + "expertise": "Intangible heritage", + "consortium": 23, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 12, + "fields": { + "name": "Max Planck Computing and Data Facility", + "shorthand": "MPCDF", + "organisation_name": "Rechenzentrum der Max-Planck-Gesellschaft am Max-Planck-Institut f\u00fcr Plasmaphysik ", + "institution": "Max Planck Gesellschaft", + "working_unit": "RZG", + "address": "Boltzmannstra\u00dfe 2", + "postal_code": "85748", + "city": "Garching", + "latitude": "48.263477", + "longitude": "11.671697", + "type_status": "", + "administrative_contact": 20, + "technical_contact": 21, + "website_url": "http://www.rzg.mpg.de/", + "description": "", + "expertise": "data storage", + "consortium": 1, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + 
"long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [2], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 24, + "fields": { + "name": "MPI for Psycholinguistics", + "shorthand": "MPI-PL", + "organisation_name": "Max Planck Institute for Psycholinguistics", + "institution": "Max Planck Gesellschaft (MPG)", + "working_unit": "The Language Archive", + "address": "Wundtlaan 1", + "postal_code": "6525 XD", + "city": "Nijmegen", + "latitude": "51.818046", + "longitude": "5.857079", + "type_status": "Certified", + "administrative_contact": 97, + "technical_contact": 76, + "website_url": "https://archive.mpi.nl", + "description": "", + "expertise": "language archives, linguistic tools, long-term preservation, multimedia and multimodal data", + "consortium": 10, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-91", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/01/The-Language-Archive.pdf", + "pid_status": "Handle (own server and prefix: 1839).", + "long_term_archiving_policy": "Long-term preservation.", + "repository_system": "FLAT (Fedora Commons based)", + "strict_versioning": true, + "type": [1], + "assessmentdates": [9], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 31, + "fields": { + "name": "National Library of Norway", + "shorthand": "NB.NO", + "organisation_name": "Spr\u00e5kbanken (Speech & Language Data Bank)", + "institution": "National Library of Norway", + "working_unit": "NB-CLARINO + Spr\u00e5kbanken", + "address": "2674 Solli", + "postal_code": "0203", + "city": "Oslo", + "latitude": "59.9148493", + "longitude": "10.718983", + "type_status": "Aiming for A+B", + "administrative_contact": 58, + "technical_contact": 58, + "website_url": "http://nb.no/clarino", + "description": "Providing metadata services and access to language resources", + "expertise": "", + "consortium": 14, + 
"type_certificate_url": "", + "dsa_url": "", + "pid_status": "URN:NBN and Assigned Handle prefix: 11538", + "long_term_archiving_policy": "", + "repository_system": "Custom", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 70, + "fields": { + "name": "NLP:EL - CLARIN Knowledge Centre for Natural Language Processing in Greece", + "shorthand": "NLP:EL", + "organisation_name": "ATHENA RC / ILSP", + "institution": "Institute for Language and Speech Processing (ILSP)", + "working_unit": "Natural Language Processing and Knowledge Extraction Unit", + "address": "Artemidos 6 & Epidavrou", + "postal_code": "15125", + "city": "Maroussi", + "latitude": "38.0550", + "longitude": "23.8077", + "type_status": "Certified", + "administrative_contact": 141, + "technical_contact": 142, + "website_url": "https://www.clarin.gr/en/kcentre/helpdesk", + "description": "Supports NLP research for Greek and digital readiness of Greek. 
NLP:EL provides educational material and guidance for NLP tools.", + "expertise": "Information, Communication and Knowledge Technologies, Language Technology and Cultural Technology", + "consortium": 19, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-158", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [21], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 63, + "fields": { + "name": "CLARIN Knowledge Centre for Data Management at NSD", + "shorthand": "NSD", + "organisation_name": "Norwegian Centre for Research Data (NSD)", + "institution": "Norwegian Centre for Research Data (NSD)", + "working_unit": "Norwegian Centre for Research Data (NSD)", + "address": "Harald H\u00e5rfagres gate 29", + "postal_code": "N-5007", + "city": "Bergen", + "latitude": "60.2786821", + "longitude": "6.1084835", + "type_status": "Certified", + "administrative_contact": 126, + "technical_contact": 126, + "website_url": "https://nsd.no/clarin-k-centre", + "description": "CLARIN Knowledge Centre (K-centre) in the areas of data management and legal and ethical issues", + "expertise": "Provides expertise in data management, including legal and ethical issues related to privacy and IPR.", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-146", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [3], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 26, + "fields": { + "name": "Oxford Text Archive", + "shorthand": "OTA", + "organisation_name": "Oxford Text Archive", + "institution": "University of Oxford", + "working_unit": "Bodleian Libraries", + "address": "Osney One Building, Osney Mead", + "postal_code": "OX2 0EW", + "city": "Oxford", + "latitude": 
"51.757571", + "longitude": "-1.261824", + "type_status": "Aiming for B.", + "administrative_contact": 42, + "technical_contact": 42, + "website_url": "http://ota.ahds.ac.uk/", + "description": "", + "expertise": "TEI, text corpora, historic texts", + "consortium": 12, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "Custom.", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 66, + "fields": { + "name": "PORTULAN CLARIN K-Centre: CLARIN Knowledge Centre for the Science and Technology of the Portuguese Language", + "shorthand": "PORTULAN CLARIN K-Centre", + "organisation_name": "Universidade de Lisboa", + "institution": "Faculdade de Ci\u00eancias da Universidade de Lisboa", + "working_unit": "Departamento de Inform\u00e1tica", + "address": "Departamento de Inform\u00e1tica - FCUL, Edif\u00edcio C6 - piso 3, Campo Grande", + "postal_code": "1749-016", + "city": "Lisboa", + "latitude": "38.7557098", + "longitude": "-9.1578508", + "type_status": "Certified", + "administrative_contact": 133, + "technical_contact": 109, + "website_url": "https://portulanclarin.net/k-centre/", + "description": "The Science and Technology of the Portuguese Language is the thematic area of this CLARIN Knowledge Centre.", + "expertise": "Related to the Portuguese language, it covers all topics from Phonetics to Discourse and Dialogue; considering all language functions from communicative performance to cultural expression; approached", + "consortium": 16, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-140", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [18], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 50, + "fields": { + "name": "PORTULAN 
CLARIN Research Infrastructure for the Science and Technology of Language", + "shorthand": "PORTULAN-CLARIN", + "organisation_name": "Universidade de Lisboa", + "institution": "Faculdade de Ci\u00eancias da Universidade de Lisboa", + "working_unit": "Departamento de Inform\u00e1tica", + "address": "Departamento de Inform\u00e1tica - FCUL, Edif\u00edcio C6 - piso 3, Campo Grande", + "postal_code": "1749-016", + "city": "Lisboa", + "latitude": "38.7557098", + "longitude": "-9.1578508", + "type_status": "Certified", + "administrative_contact": 147, + "technical_contact": 109, + "website_url": "https://portulanclarin.net/", + "description": "", + "expertise": "science and technology of language, digital humanities, artificial intelligence, cognitive science", + "consortium": 16, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-157", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/12/PORTULAN-CLARIN.pdf", + "pid_status": "Handle via EPIC (prefix: 21.11115)", + "long_term_archiving_policy": "", + "repository_system": "META-SHARE derivative", + "strict_versioning": true, + "type": [1], + "assessmentdates": [20], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 41, + "fields": { + "name": "Phonogrammarchiv Institute for audiovisual Research and Documentation", + "shorthand": "Phonogrammarchiv", + "organisation_name": "Austrian Academy of Sciences", + "institution": "Phonogrammarchiv", + "working_unit": "Phonogrammarchiv", + "address": "Liebeiggasse 5", + "postal_code": "1010", + "city": "Vienna", + "latitude": "48.2133217", + "longitude": "16.3553763", + "type_status": "Certified", + "administrative_contact": 145, + "technical_contact": 86, + "website_url": "http://www.phonogrammarchiv.at/wwwnew/clarin-knowledge-centre.htm", + "description": "The Phonogrammarchiv is a multi-disciplinary research sound and video archive, covering holdings from all continents.", + "expertise": "recordings in the fields of 
cultural and social anthropology, ethnomusicology, musicology, linguistics, sociology, zoology, medi", + "consortium": 2, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-113", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 38, + "fields": { + "name": "PolMine Project", + "shorthand": "PolMine", + "organisation_name": "University of Duisburg-Essen", + "institution": "University of Duisburg-Essen", + "working_unit": "Faculty of the Social Sciences", + "address": "Lotharstr. 57", + "postal_code": "47057", + "city": "Duisburg", + "latitude": "51.4294378", + "longitude": "6.7993325", + "type_status": "", + "administrative_contact": 74, + "technical_contact": 75, + "website_url": "http://www.polmine.de", + "description": "", + "expertise": "The specialization of the centre are politically relevant texts in the public domain, particularly corpora of plenary protocols. 
", + "consortium": null, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 58, + "fields": { + "name": "South African Centre for Digital Language Resources", + "shorthand": "SADiLaR", + "organisation_name": "South African Centre for Digital Language Resources", + "institution": "North-West University", + "working_unit": "Faculty of Humanities", + "address": "11 Hoffman street, Potchefstroom, 2531, South Africa", + "postal_code": "2531", + "city": "Potchefstroom", + "latitude": "-26.6906464", + "longitude": "27.0907368", + "type_status": "Aiming for B", + "administrative_contact": 118, + "technical_contact": 134, + "website_url": "https://www.sadilar.org/", + "description": "", + "expertise": "", + "consortium": 27, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "Handle (own server and prefix: 20.500.12185)", + "long_term_archiving_policy": "", + "repository_system": "DSpace", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 69, + "fields": { + "name": "CLARIN Knowledge Centre for Systems and Frameworks for Morphologically Rich Languages", + "shorthand": "SAFMORIL", + "organisation_name": "University of Helsinki", + "institution": "University of Helsinki", + "working_unit": "Department of Digital Humanities", + "address": "P.O. 
Box 4", + "postal_code": "00014", + "city": "Helsinki", + "latitude": "60.172839", + "longitude": "24.950366", + "type_status": "Certified", + "administrative_contact": 29, + "technical_contact": 140, + "website_url": "https://www.kielipankki.fi/safmoril/", + "description": "", + "expertise": "SAFMORIL brings together researchers and developers in the area of computational morphology and its application during language processing.", + "consortium": 7, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-156", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [18], + "monitoring_contacts": [140] + } + }, { + "model": "centre_registry.centre", + "pk": 28, + "fields": { + "name": "Speech & Language Data Repository", + "shorthand": "SLDR", + "organisation_name": "Laboratoire Parole et Langage (LPL)", + "institution": "Centre National de la Recherche Scientifique (CNRS) and Aix-Marseille University", + "working_unit": "UMR 7309", + "address": "5 avenue Pasteur", + "postal_code": "13100", + "city": "Aix-en-Provence", + "latitude": "43.535070", + "longitude": "5.446179", + "type_status": "", + "administrative_contact": 45, + "technical_contact": 46, + "website_url": "http://sldr.org/", + "description": "SLDR is a repository for oral and linguistic resources aimed at their long-term preservation and sharing", + "expertise": "oral and linguistic resources", + "consortium": 23, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "Handle", + "long_term_archiving_policy": "", + "repository_system": "Self-developed", + "strict_versioning": true, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 56, + "fields": { + "name": "CLARIN Knowledge Centre for The Languages of Sweden", + "shorthand": "SWELANG", + "organisation_name": "Spr\u00e5kr\u00e5det (The Language Council of 
Sweden)", + "institution": "Institute of Language and Folklore in Stockholm, Uppsala and G\u00f6teborg", + "working_unit": "Spr\u00e5kr\u00e5det (The Language Council of Sweden)", + "address": "Box 20057", + "postal_code": "SE-104 60", + "city": "Stockholm", + "latitude": "59.334591", + "longitude": "18.063240", + "type_status": "Certified", + "administrative_contact": 116, + "technical_contact": 116, + "website_url": "https://sweclarin.se/eng/centers/sprakradet", + "description": "Information service offering advice on the use of digital language resources and tools for the Swedish language, minority languages in Sweden, Swedish sign language, Swedish dialects et al.", + "expertise": "language technology, corpus linguistics and language counseling", + "consortium": 20, + "type_certificate_url": "https://hdl.handle.net/11372/DOC-136", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [10], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 32, + "fields": { + "name": "Spanish CLARIN K-Centre", + "shorthand": "Spanish K-Centre", + "organisation_name": "Spanish consortium, coordinator: CLARIN Competence Centre IULA-UPF and HDLab@UPF", + "institution": "Universitat Pompeu Fabra", + "working_unit": "Department of Humanities", + "address": "Ramon Trias Fargas, 25-27", + "postal_code": "08005", + "city": "Barcelona", + "latitude": "41.3895849", + "longitude": "2.1914035", + "type_status": "Certified (K)", + "administrative_contact": 59, + "technical_contact": 59, + "website_url": "http://clarin-es-lab.org", + "description": "Distributed CLARIN K Centre consisting of CLARIN Competence Center IULA-UPF and HDLab@UPF-Department of Humanities (Barcelona), UNED \u2013 LINHD: Laboratorio de innovaci\u00f3n en Humanidades Digitales (Madrid), and UPV \u2013 Grupo IXA (San Sebasti\u00e1n).", + "expertise": "Services to researchers 
working with Spanish texts and, additionally, IXA can afford experience in handling Basque texts and IULA-UPF-CCC Catalan texts.", + "consortium": null, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-111", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 37, + "fields": { + "name": "Spr\u00e5kbanken, The Swedish language bank", + "shorthand": "Sprakbanken", + "organisation_name": "Spr\u00e5kbanken", + "institution": "University of Gothenburg", + "working_unit": "Department of Swedish", + "address": "Box 200", + "postal_code": "405 30", + "city": "G\u00f6teborg", + "latitude": "57.6938986", + "longitude": "11.9815064", + "type_status": "Certified", + "administrative_contact": 72, + "technical_contact": 73, + "website_url": "http://spraakbanken.gu.se", + "description": "To offer repository, resources and services. 
Additional A-services and virtual K-centre.", + "expertise": "", + "consortium": 20, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-141", + "dsa_url": "https://assessment.datasealofapproval.org/assessment_208/seal/html/", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "LINDAT-Dspace", + "strict_versioning": true, + "type": [1, 5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 71, + "fields": { + "name": "TextGrid Repository", + "shorthand": "TGrep", + "organisation_name": "DARIAH-DE", + "institution": "Georg-August-Universit\u00e4t G\u00f6ttingen", + "working_unit": "Nieders\u00e4chsische Staats- und Universit\u00e4tsbibliothek G\u00f6ttingen", + "address": "Platz der G\u00f6ttinger Sieben 1", + "postal_code": "37073", + "city": "G\u00f6ttingen", + "latitude": "51.54016", + "longitude": "9.93405", + "type_status": "", + "administrative_contact": 143, + "technical_contact": 144, + "website_url": "https://textgridrep.org/", + "description": "The TextGrid Repository offers an extensive searchable and adaptable corpus of XML/TEI encoded texts and images.", + "expertise": "", + "consortium": null, + "type_certificate_url": "", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2020/05/TextGrid-Repository.pdf", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 48, + "fields": { + "name": "The Troms\u00f8 Repository of Language and Linguistics", + "shorthand": "TROLLing", + "organisation_name": "UiT The Arctic University of Norway", + "institution": "UiT The Arctic University of Norway", + "working_unit": "University Library", + "address": "Postboks 6050 Langnes 9037 Troms\u00f8", + "postal_code": "9037", + "city": "Troms\u00f8", + "latitude": "69.6798027", + "longitude": 
"18.9712161", + "type_status": "", + "administrative_contact": 101, + "technical_contact": 100, + "website_url": "https://trolling.uit.no/", + "description": "The Troms\u00f8 Repository of Language and Linguistics (TROLLing) is a repository of linguistic data, (statistical) code, and other related materials. The repository is open access, which means that all information is available to everyone. All postings are accompanied by searchable metadata that identify the researchers, the languages and linguistic phenomena involved, the statistical methods applied, and scholarly publications based on the data (where relevant).", + "expertise": "", + "consortium": 14, + "type_certificate_url": "", + "dsa_url": "http://site.uit.no/dataverseno/2020/03/30/dataverseno-is-coretrustseal-certified/", + "pid_status": "DOI", + "long_term_archiving_policy": "", + "repository_system": "Dataverse", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 55, + "fields": { + "name": "CLARIN Knowledge Center for Terminology Resources and Translation Corpora (TRTC)", + "shorthand": "TRTC", + "organisation_name": "University of Vienna, Centre for Translation Studies", + "institution": "University of Vienna", + "working_unit": "Centre for Translation Studies", + "address": "Gymnasiumstra\u00dfe 50", + "postal_code": "1190", + "city": "Vienna", + "latitude": "48.209117", + "longitude": "16.377080", + "type_status": "Certified", + "administrative_contact": 115, + "technical_contact": 115, + "website_url": "https://trtc.univie.ac.at", + "description": "Helpdesk, material and training about the preparation and documentation on terminology resources and translation corpora", + "expertise": "", + "consortium": 2, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-150", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": 
false, + "type": [5], + "assessmentdates": [8], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 43, + "fields": { + "name": "CLARINO Text Laboratory Centre", + "shorthand": "TextLab", + "organisation_name": "The Text laboratory", + "institution": "University of Oslo", + "working_unit": "Department of Lingusitics and Scandinavian Studies", + "address": "P.O. Box 1102 Blindern ", + "postal_code": "0317", + "city": "Oslo", + "latitude": "59.9421883", + "longitude": "10.722153", + "type_status": "", + "administrative_contact": 88, + "technical_contact": 89, + "website_url": "http://www.tekstlab.uio.no/clarino/", + "description": "Goal: We want our linguistic resources (corpora, tools and word lists) to be available to the CLARIN community.", + "expertise": "Language technology, corpora, grammatical tools and corpus tools", + "consortium": 14, + "type_certificate_url": "", + "dsa_url": "", + "pid_status": "We use the PID service from the National Library of Norway", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 16, + "fields": { + "name": "IULA-UPF-CC-CLARIN", + "shorthand": "UPF", + "organisation_name": "Institut Universitari de Ling\u00fc\u00edstica Aplicada", + "institution": "Universitat Pompeu Fabra", + "working_unit": "Institut Universitari de Ling\u00fc\u00edstica Aplicada", + "address": "Carrer de Roc Boronat, 138", + "postal_code": "08018", + "city": "Barcelona", + "latitude": "41.403964", + "longitude": "2.193464", + "type_status": "", + "administrative_contact": 28, + "technical_contact": 28, + "website_url": "http://services.iula.upf.edu/", + "description": "We offer harvestable machine readable metadata about resources and web services for text enrichement and quantitative text analysis services.", + "expertise": "", + "consortium": null, + 
"type_certificate_url": "", + "dsa_url": "", + "pid_status": "Handle (own server and prefix: 10230).", + "long_term_archiving_policy": "", + "repository_system": "DSpace", + "strict_versioning": false, + "type": [3], + "assessmentdates": [], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 13, + "fields": { + "name": "Universit\u00e4t des Saarlandes", + "shorthand": "UdS", + "organisation_name": "Fachrichtung 4.6", + "institution": "Universit\u00e4t des Saarlandes", + "working_unit": "Englische Sprach- und \u00dcbersetzungswissenschaft", + "address": "Campus A2.2", + "postal_code": "66123", + "city": "Saarbr\u00fccken", + "latitude": "49.256004", + "longitude": "7.039011", + "type_status": "Certified", + "administrative_contact": 22, + "technical_contact": 23, + "website_url": "http://fedora.clarin-d.uni-saarland.de/", + "description": "", + "expertise": "Creation and annotation of corpora, empirical corpus linguistics, language variation and register analysis.", + "consortium": 1, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-92", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/02/CLARIND-UDS.pdf", + "pid_status": "Handle via EPIC.", + "long_term_archiving_policy": "", + "repository_system": "Fedora Commons", + "strict_versioning": false, + "type": [1], + "assessmentdates": [11], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 65, + "fields": { + "name": "ZIM Centre for Information Modelling", + "shorthand": "ZIM", + "organisation_name": "Institute Centre for Information Modelling - Austrian Centre for Digital Humanities", + "institution": "University of Graz", + "working_unit": "Faculty for Arts and Humanities", + "address": "Elisabethstra\u00dfe 59/III", + "postal_code": "8010", + "city": "Graz", + "latitude": "47.064", + "longitude": "15.4399", + "type_status": "Certified", + "administrative_contact": 129, + "technical_contact": 130, + "website_url": 
"https://zim.uni-graz.at/", + "description": "The centre's focus is on applied research in the area of information and data processing in the humanities.", + "expertise": "Digital scholarly edition, long-term preservation, digital museology, semantic web technologies", + "consortium": 2, + "type_certificate_url": "http://hdl.handle.net/11372/DOC-160", + "dsa_url": "https://www.coretrustseal.org/wp-content/uploads/2019/04/GAMS-Geisteswissenschaftliches-Asset-Management-System.pdf", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [1], + "assessmentdates": [23], + "monitoring_contacts": [] + } + }, { + "model": "centre_registry.centre", + "pk": 42, + "fields": { + "name": "Lund University Humanities Lab", + "shorthand": "lundlab", + "organisation_name": "Lund University", + "institution": "Lund University Humanities Lab", + "working_unit": "Lund University Humanities Lab", + "address": "Box 201", + "postal_code": "SE - 221 00", + "city": "Lund", + "latitude": "55.7090161", + "longitude": "13.1974325", + "type_status": "Certified", + "administrative_contact": 87, + "technical_contact": 87, + "website_url": "http://www.humlab.lu.se/", + "description": "The Lund University Humanities Lab provides tools and expertise related to language archiving, corpus and (meta)data management.", + "expertise": "Multimodal and sensor-based methods, including EEG, eye-tracking, articulography, virtual reality, motion capture, av-recording", + "consortium": 20, + "type_certificate_url": "https://hdl.handle.net/11372/DOC-137", + "dsa_url": "", + "pid_status": "", + "long_term_archiving_policy": "", + "repository_system": "", + "strict_versioning": false, + "type": [3, 5], + "assessmentdates": [], + "monitoring_contacts": [] + } + }] \ No newline at end of file diff --git a/src/test/resources/centre-registry-Consortium.json b/src/test/resources/centre-registry-Consortium.json new file mode 100644 index 
00000000..fcb0fa9f --- /dev/null +++ b/src/test/resources/centre-registry-Consortium.json @@ -0,0 +1 @@ +[{"model": "centre_registry.consortium", "pk": 2, "fields": {"country_code": "AT", "country_name": "Austria", "is_observer": false, "name": "CLARIAH-AT", "website_url": "http://digital-humanities.at", "alias": "at"}}, {"model": "centre_registry.consortium", "pk": 13, "fields": {"country_code": "BG", "country_name": "Bulgaria", "is_observer": false, "name": "CLaDA-BG", "website_url": "http://clada-bg.eu/", "alias": ""}}, {"model": "centre_registry.consortium", "pk": 3, "fields": {"country_code": "CZ", "country_name": "Czech Republic", "is_observer": false, "name": "LINDAT-CLARIN", "website_url": "http://www.lindat.cz/", "alias": "cz"}}, {"model": "centre_registry.consortium", "pk": 1, "fields": {"country_code": "DE", "country_name": "Germany", "is_observer": false, "name": "CLARIN-D", "website_url": "http://www.clarin-d.de/", "alias": "de"}}, {"model": "centre_registry.consortium", "pk": 4, "fields": {"country_code": "DK", "country_name": "Denmark", "is_observer": false, "name": "CLARIN-DK", "website_url": "https://clarin.dk/", "alias": "dk"}}, {"model": "centre_registry.consortium", "pk": 5, "fields": {"country_code": "EE", "country_name": "Estonia", "is_observer": false, "name": "CELR", "website_url": "http://keeleressursid.ee/en/", "alias": "ee"}}, {"model": "centre_registry.consortium", "pk": 7, "fields": {"country_code": "FI", "country_name": "Finland", "is_observer": false, "name": "FIN-CLARIN", "website_url": "https://www.kielipankki.fi/organization/", "alias": "fi"}}, {"model": "centre_registry.consortium", "pk": 23, "fields": {"country_code": "FR", "country_name": "France", "is_observer": true, "name": "Huma-Num", "website_url": "https://www.huma-num.fr/", "alias": "fr"}}, {"model": "centre_registry.consortium", "pk": 19, "fields": {"country_code": "GR", "country_name": "Greece", "is_observer": false, "name": "CLARIN-EL", "website_url": 
"http://www.clarin.gr/", "alias": "gr"}}, {"model": "centre_registry.consortium", "pk": 25, "fields": {"country_code": "HR", "country_name": "Croatia", "is_observer": false, "name": "HR\u2011CLARIN", "website_url": "", "alias": "hr"}}, {"model": "centre_registry.consortium", "pk": 26, "fields": {"country_code": "IS", "country_name": "Iceland", "is_observer": true, "name": "CLARIN-IS", "website_url": "http://clarin.is", "alias": ""}}, {"model": "centre_registry.consortium", "pk": 21, "fields": {"country_code": "IT", "country_name": "Italy", "is_observer": false, "name": "CLARIN-IT", "website_url": "http://www.clarin-it.it", "alias": "it"}}, {"model": "centre_registry.consortium", "pk": 15, "fields": {"country_code": "LT", "country_name": "Lithuania", "is_observer": false, "name": "CLARIN-LT", "website_url": "http://clarin-lt.lt/", "alias": "lt"}}, {"model": "centre_registry.consortium", "pk": 24, "fields": {"country_code": "LV", "country_name": "Latvia", "is_observer": false, "name": "CLARIN-LV", "website_url": "http://www.clarin.lv/lv/", "alias": "lv"}}, {"model": "centre_registry.consortium", "pk": 10, "fields": {"country_code": "NL", "country_name": "The Netherlands", "is_observer": false, "name": "CLARIAH", "website_url": "http://www.clariah.nl/", "alias": "nl"}}, {"model": "centre_registry.consortium", "pk": 14, "fields": {"country_code": "NO", "country_name": "Norway", "is_observer": false, "name": "CLARINO", "website_url": "http://clarino.uib.no/", "alias": "no"}}, {"model": "centre_registry.consortium", "pk": 11, "fields": {"country_code": "PL", "country_name": "Poland", "is_observer": false, "name": "CLARIN-PL", "website_url": "http://www.clarin-pl.eu/", "alias": "pl"}}, {"model": "centre_registry.consortium", "pk": 16, "fields": {"country_code": "PT", "country_name": "Portugal", "is_observer": false, "name": "PORTULAN CLARIN", "website_url": "https://portulanclarin.net", "alias": "pt"}}, {"model": "centre_registry.consortium", "pk": 20, "fields": 
{"country_code": "SE", "country_name": "Sweden", "is_observer": false, "name": "SWE-CLARIN", "website_url": "http://sweclarin.se/", "alias": "se"}}, {"model": "centre_registry.consortium", "pk": 18, "fields": {"country_code": "SI", "country_name": "Slovenia", "is_observer": false, "name": "CLARIN.SI", "website_url": "http://www.clarin.si/", "alias": "si"}}, {"model": "centre_registry.consortium", "pk": 12, "fields": {"country_code": "UK", "country_name": "United Kingdom", "is_observer": true, "name": "CLARIN-UK", "website_url": "http://www.clarin.ac.uk/", "alias": "uk"}}, {"model": "centre_registry.consortium", "pk": 27, "fields": {"country_code": "ZA", "country_name": "South Africa", "is_observer": true, "name": "SADiLaR", "website_url": "https://sadilar.org/", "alias": ""}}, {"model": "centre_registry.consortium", "pk": 22, "fields": {"country_code": "hu", "country_name": "Hungary", "is_observer": false, "name": "HunCLARIN", "website_url": "http://corpus.nytud.hu/hunclarin/", "alias": "hu"}}] \ No newline at end of file diff --git a/src/test/resources/centre-registry-OAIPMHEndpoint.json b/src/test/resources/centre-registry-OAIPMHEndpoint.json new file mode 100644 index 00000000..cc0fac71 --- /dev/null +++ b/src/test/resources/centre-registry-OAIPMHEndpoint.json @@ -0,0 +1,502 @@ +[ + { + "model": "centre_registry.oaipmhendpoint", + "pk": 20, + "fields": { + "centre": 14, + "uri": "http://clarin.dk/oaiprovider/", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 9, + "fields": { + "centre": 10, + "uri": "http://clarin04.ims.uni-stuttgart.de/oaiprovider/oai", + "note": "", + "oai_pmh_sets": [3] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 31, + "fields": { + "centre": 29, + "uri": "http://clarino.uib.no/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 38, + "fields": { + "centre": 40, + "uri": "http://clst.science.ru.nl/oai/provider", + 
"note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 30, + "fields": { + "centre": 33, + "uri": "http://cocoon.huma-num.fr/crdo_servlet/oai-pmh", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 8, + "fields": { + "centre": 9, + "uri": "http://corpora.uni-hamburg.de:8080/oai/provider", + "note": "", + "oai_pmh_sets": [3] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 32, + "fields": { + "centre": 34, + "uri": "http://dspace-clarin-it.ilc.cnr.it/repository/oai/request", + "note": "", + "oai_pmh_sets": [2] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 6, + "fields": { + "centre": 13, + "uri": "http://fedora.clarin-d.uni-saarland.de/oaiprovider/", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 11, + "fields": { + "centre": 3, + "uri": "http://lindat.mff.cuni.cz/repository/oai/request", + "note": "", + "oai_pmh_sets": [8] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 17, + "fields": { + "centre": 17, + "uri": "http://metalb.csc.fi/cgi-bin/que", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 23, + "fields": { + "centre": 20, + "uri": "http://oai.clarin-beta.dans.knaw.nl/OAIHandler", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 18, + "fields": { + "centre": 18, + "uri": "http://oai.talkbank.org/oai/provider", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 10, + "fields": { + "centre": 21, + "uri": "http://oaipmh.huygens.knaw.nl/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 25, + "fields": { + "centre": 4, + "uri": "http://openscience.uni-leipzig.de/index.php/mr2/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": 
"centre_registry.oaipmhendpoint", + "pk": 45, + "fields": { + "centre": 44, + "uri": "http://phonotheque.mmsh.huma-num.fr/oai/provider", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 35, + "fields": { + "centre": 38, + "uri": "http://polmine.sowi.uni-due.de:8080/oai/provider", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 4, + "fields": { + "centre": 11, + "uri": "http://repos.ids-mannheim.de/oaiprovider/", + "note": "", + "oai_pmh_sets": [4] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 43, + "fields": { + "centre": 14, + "uri": "http://repository.clarin.dk/repository/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 12, + "fields": { + "centre": 22, + "uri": "http://repository.clarin.inl.nl/oai/provider", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 26, + "fields": { + "centre": 28, + "uri": "http://sldr.org/oai-pmh.php", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 39, + "fields": { + "centre": 43, + "uri": "http://tekstlab.uio.no/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 54, + "fields": { + "centre": 71, + "uri": "http://textgridlab.org/1.0/tgoaipmh/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 47, + "fields": { + "centre": 60, + "uri": "http://worldviews.gei.de/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 24, + "fields": { + "centre": 16, + "uri": "http://ws02.iula.upf.edu/corpus_data/oai-iula/oai.pl", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 27, + "fields": { + "centre": 30, + "uri": 
"http://www.clarin.si/repository/oai/request", + "note": "", + "oai_pmh_sets": [6] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 13, + "fields": { + "centre": 23, + "uri": "http://www.meertens.knaw.nl/oai/oai_server.php", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 28, + "fields": { + "centre": 31, + "uri": "http://www.nb.no/clarino/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 3, + "fields": { + "centre": 5, + "uri": "http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl", + "note": "", + "oai_pmh_sets": [10,11] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 41, + "fields": { + "centre": 47, + "uri": "https://api.ka3.uni-koeln.de/oai/lac", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 5, + "fields": { + "centre": 24, + "uri": "https://archive.mpi.nl/oai2", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 52, + "fields": { + "centre": 67, + "uri": "https://clarin-belarus.corpus.by/provider", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 16, + "fields": { + "centre": 25, + "uri": "https://clarin-pl.eu/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 7, + "fields": { + "centre": 6, + "uri": "https://clarin.bbaw.de:8088/oaiprovider", + "note": "", + "oai_pmh_sets": [4] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 42, + "fields": { + "centre": 49, + "uri": "https://clarin.eurac.edu/repository/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 33, + "fields": { + "centre": 36, + "uri": "https://clarin.vdu.lt/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": 
"centre_registry.oaipmhendpoint", + "pk": 2, + "fields": { + "centre": 4, + "uri": "https://clarinoai.informatik.uni-leipzig.de/oaiprovider/oai", + "note": "", + "oai_pmh_sets": [7] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 51, + "fields": { + "centre": 42, + "uri": "https://corpora.humlab.lu.se/ds/oaiprovider/oai2", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 49, + "fields": { + "centre": 65, + "uri": "https://gams.uni-graz.at/oaiprovider/", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 14, + "fields": { + "centre": 15, + "uri": "https://metashare.ut.ee/oai_pmh/", + "note": "", + "oai_pmh_sets": [1] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 36, + "fields": { + "centre": 35, + "uri": "https://oai.cedifor.de/", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 19, + "fields": { + "centre": 26, + "uri": "https://ota.bodleian.ox.ac.uk/repository/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 44, + "fields": { + "centre": 50, + "uri": "https://portulanclarin.net/repository/oaipmh/", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 29, + "fields": { + "centre": 29, + "uri": "https://repo.clarino.uib.no/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 46, + "fields": { + "centre": 58, + "uri": "https://repo.sadilar.org/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 34, + "fields": { + "centre": 37, + "uri": "https://repo.spraakbanken.gu.se/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 50, + "fields": { + "centre": 64, + "uri": 
"https://repository.clarin.is/repository/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 53, + "fields": { + "centre": 68, + "uri": "https://repository.clarin.lv/repository/oai/request", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 48, + "fields": { + "centre": 62, + "uri": "https://repository.de.dariah.eu/1.0/oaipmh/oai", + "note": "", + "oai_pmh_sets": [] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 21, + "fields": { + "centre": 1, + "uri": "https://talar.sfb833.uni-tuebingen.de/erdora/rest/oai", + "note": "", + "oai_pmh_sets": [4] + } + }, + { + "model": "centre_registry.oaipmhendpoint", + "pk": 37, + "fields": { + "centre": 23, + "uri": "https://www.meertens.knaw.nl/flat/oai2", + "note": "", + "oai_pmh_sets": [] + } + } +] \ No newline at end of file diff --git a/src/test/resources/centre-registry-OAIPMHEndpointSet.json b/src/test/resources/centre-registry-OAIPMHEndpointSet.json new file mode 100644 index 00000000..ac4209a9 --- /dev/null +++ b/src/test/resources/centre-registry-OAIPMHEndpointSet.json @@ -0,0 +1,71 @@ +[{ + "model": "centre_registry.oaipmhendpointset", + "pk": 3, + "fields": { + "set_spec": "", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 4, + "fields": { + "set_spec": "WebLichtWebServices", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 5, + "fields": { + "set_spec": "Weblicht", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 2, + "fields": { + "set_spec": "hdl_000-c0-111_78", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 6, + "fields": { + "set_spec": "hdl_11356_1077", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 8, + "fields": { + "set_spec": 
"hdl_11858_00-097C-0000-0023-8C33-2", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 7, + "fields": { + "set_spec": "oai:webservices", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 1, + "fields": { + "set_spec": "toolService:service:WebLichtWebService", + "set_type": "WebLicht" + } + }, { + "model": "centre_registry.oaipmhendpointset", + "pk": 10, + "fields": { + "set_spec": "test-set1", + "set_type": "Test1" + } + },{ + "model": "centre_registry.oaipmhendpointset", + "pk": 11, + "fields": { + "set_spec": "test-set2", + "set_type": "Test2" + } + }] diff --git a/src/test/resources/config/test-config-basic.xml b/src/test/resources/config/test-config-basic.xml index 695f702c..f676740f 100644 --- a/src/test/resources/config/test-config-basic.xml +++ b/src/test/resources/config/test-config-basic.xml @@ -9,8 +9,8 @@ 2 - - 10000 + + 10 4 diff --git a/src/test/resources/config/test-config-import.xml b/src/test/resources/config/test-config-import.xml index c9239b16..36351d0c 100644 --- a/src/test/resources/config/test-config-import.xml +++ b/src/test/resources/config/test-config-import.xml @@ -9,8 +9,8 @@ 2 - - 10000 + + 10 4