Commit fdbbcedb authored by Thibault Ehrhart's avatar Thibault Ehrhart

Initial implementation of BratAnnotator

parent c2652f8a
props/config.properties.example
# Created by .ignore support plugin (hsz.mobi)
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion
*.iml
## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:
# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries
# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml
# Gradle:
# .idea/gradle.xml
# .idea/libraries
# Mongo Explorer plugin:
# .idea/mongoSettings.xml
## File-based project format:
*.ipr
*.iws
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
### Java template
*.class
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.ear
# Turtle files
*.ttl
#Log files
*.log
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
###! Exceptions
# Libs
!libs/*.jar
# BratAnnotator
Annotates RDF turtle files in `data/dump/agencefrancepresse` using brat annotation files in `data/resource/brat/agencefrancepresse`.
## Compilation
```
mvn -U clean package
```
## Usage
```
java -Dlogfile.name=asrael-bratannotator.log -Dlogfile.append=true -jar target/BratAnnotator-1.0-SNAPSHOT.jar
```
java -Dlogfile.name=asrael-bratannotator.log -Dlogfile.append=true -jar target/BratAnnotator-1.0-SNAPSHOT.jar
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>BratAnnotator</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>fr.eurecom.asrael.bratannotator.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>oss-sonatype</id>
<name>oss-sonatype</name>
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
    <!--
      Pinned to an explicit release instead of the LATEST meta-version so
      that builds are reproducible. 3.17.0 is the final Jena 3.x release;
      Jena 4 and later require Java 11, while this build targets 1.8.
    -->
    <groupId>org.apache.jena</groupId>
    <artifactId>apache-jena-libs</artifactId>
    <type>pom</type>
    <version>3.17.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>com.pengyifan.brat</groupId>
<artifactId>pengyifan-brat</artifactId>
<version>1.3.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
annotator.path=/Volumes/Data/Work/Projects/asrael/news-annotations/src
annotator.file=annotate_article.py
\ No newline at end of file
package fr.eurecom.asrael.bratannotator;
/**
 * Command-line entry point of the application (referenced as the main class
 * of the shaded jar).
 */
public class Main {

    /**
     * Instantiates a {@code BratAnnotator} and invokes its {@code run()} method.
     *
     * @param args command-line arguments; they are not read by this method
     */
    public static void main(final String... args) {
        final BratAnnotator annotator = new BratAnnotator();
        annotator.run();
    }
}
package fr.eurecom.asrael.bratannotator.datatypes;
import com.github.jsonldjava.core.RDFDataset;
import fr.eurecom.asrael.bratannotator.utils.UUIDGenerator;
import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.rdf.model.*;
import org.apache.jena.vocabulary.*;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.Map;
/**
 * Mutable description of a single text annotation that can be rendered as an
 * RDF model using the Web Annotation ({@code oa}) vocabulary.
 *
 * <p>All fields are populated through the setters; {@link #getModel()} reads
 * whatever values are currently set.
 */
public class Annotation {
    /** Prefix of every annotation IRI minted by {@link #getModel()}. */
    private static final String BASE_URI = "http://asrael.eurecom.fr/annotation/";
    /** Namespace of the Web Annotation vocabulary. */
    private static final String OA_NS = "http://www.w3.org/ns/oa#";
    /** Namespace of XML Schema datatypes. */
    private static final String XSD_NS = "http://www.w3.org/2001/XMLSchema#";

    private String body;
    private String source;
    private int startPosition;
    private int endPosition;
    private String canonicalURL;

    /**
     * @param body the text emitted as the {@code oa:bodyValue} plain literal
     */
    public void setBody(String body) {
        this.body = body;
    }

    /**
     * @param source the IRI emitted as the {@code oa:hasSource} of the target
     */
    public void setSource(String source) {
        this.source = source;
    }

    /**
     * @param startPosition the value emitted as {@code oa:start} of the selector
     */
    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    /**
     * @param endPosition the value emitted as {@code oa:end} of the selector
     */
    public void setEndPosition(int endPosition) {
        this.endPosition = endPosition;
    }

    /**
     * Create the RDF model of the current annotation.
     *
     * <p>The model contains one {@code oa:Annotation} resource carrying an
     * {@code oa:bodyValue} literal and an {@code oa:hasTarget} blank node. The
     * target links to the source via {@code oa:hasSource} and to an
     * {@code oa:TextPositionSelector} blank node holding {@code oa:start} and
     * {@code oa:end} as {@code xsd:nonNegativeInteger} literals.
     *
     * <p>NOTE(review): the annotation IRI is derived from the canonical URL
     * only, so two annotations sharing a canonical URL receive the same IRI.
     * Confirm this is intended.
     *
     * <p>NOTE(review): {@code body} and {@code source} are handed to Jena as
     * they are, and negative positions are not rejected here. Presumably the
     * caller sets valid values first; verify against the caller.
     *
     * @return A RDF model corresponding to an annotation
     * @throws NullPointerException if the canonical URL has not been set
     */
    public final Model getModel() {
        // Fail early with a clear message; the UUID seed must not be null.
        if (this.canonicalURL == null) {
            throw new NullPointerException("canonicalURL must be set before calling getModel()");
        }

        final Model model = ModelFactory.createDefaultModel();

        final Map<String, String> prefixes = new HashMap<>();
        prefixes.put("rdf", RDF.getURI());
        prefixes.put("xsd", XSD_NS);
        prefixes.put("oa", OA_NS);
        model.setNsPrefixes(prefixes);

        // The IRI is deterministic for a given canonical URL, so build the
        // annotation resource once and reuse it for every statement.
        final Resource annotation = ResourceFactory.createResource(
                BASE_URI + UUIDGenerator.generateUUID(this.canonicalURL));

        model.add(annotation, RDF.type, ResourceFactory.createResource(OA_NS + "Annotation"));

        // Body
        model.add(annotation, oa("bodyValue"), ResourceFactory.createPlainLiteral(this.body));

        // Target -> hasSource
        final Resource target = model.createResource();
        target.addProperty(oa("hasSource"), ResourceFactory.createResource(this.source));

        // Target -> hasSelector
        final Resource selector = model.createResource();
        selector.addProperty(RDF.type, ResourceFactory.createResource(OA_NS + "TextPositionSelector"));
        selector.addProperty(oa("start"), positionLiteral(this.startPosition));
        selector.addProperty(oa("end"), positionLiteral(this.endPosition));
        target.addProperty(oa("hasSelector"), selector);

        // Add hasTarget to annotation
        model.add(annotation, oa("hasTarget"), target);

        return model;
    }

    /**
     * @param canonicalURL the string used as the seed of the annotation IRI
     */
    public void setCanonicalURL(String canonicalURL) {
        this.canonicalURL = canonicalURL;
    }

    /** Builds the property {@code oa:<localName>}. */
    private static Property oa(final String localName) {
        return ResourceFactory.createProperty(OA_NS + localName);
    }

    /** Builds an {@code xsd:nonNegativeInteger} typed literal for a text offset. */
    private static Literal positionLiteral(final int position) {
        return ResourceFactory.createTypedLiteral(String.valueOf(position), XSDDatatype.XSDnonNegativeInteger);
    }
}
package fr.eurecom.asrael.bratannotator.utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.bind.DatatypeConverter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.UUID;
/**
 * Utility for deriving deterministic, name-based UUID strings from a seed.
 *
 * @author Julien Plu
 */
public final class UUIDGenerator {
    /** Charset used for both the seed bytes and the hex digest bytes. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");
    /** Upper-case digits, matching the hex form the UUID has always been derived from. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    private UUIDGenerator() {
    }

    /**
     * Generate an UUID from a seed.
     *
     * <p>The seed is hashed with SHA-1, the digest is rendered as upper-case
     * hexadecimal text, and that text is turned into a name-based UUID. The
     * same seed therefore always yields the same UUID.
     *
     * @param seed The seed used to generate the UUID
     * @return the generated UUID
     * @throws NullPointerException if {@code seed} is {@code null}
     * @throws IllegalStateException if the runtime provides no SHA-1 digest
     */
    public static String generateUUID(final String seed) {
        if (seed == null) {
            throw new NullPointerException("seed must not be null");
        }
        final MessageDigest sha1;
        try {
            sha1 = MessageDigest.getInstance("SHA-1");
        } catch (NoSuchAlgorithmException ex) {
            // Fail loudly: returning an empty string here would let callers
            // build identifiers with no unique part.
            throw new IllegalStateException("SHA-1 message digest is not available", ex);
        }
        final String hash = toUpperHex(sha1.digest(seed.getBytes(UTF_8)));
        return UUID.nameUUIDFromBytes(hash.getBytes(UTF_8)).toString();
    }

    /** Renders bytes as upper-case hexadecimal, two characters per byte. */
    private static String toUpperHex(final byte[] bytes) {
        final char[] out = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++) {
            final int value = bytes[i] & 0xFF;
            out[2 * i] = HEX_DIGITS[value >>> 4];
            out[2 * i + 1] = HEX_DIGITS[value & 0x0F];
        }
        return new String(out);
    }
}
log4j.rootLogger = INFO, stdlog, FILE
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File=${logfile.name}
log4j.appender.FILE.ImmediateFlush=true
log4j.appender.FILE.Threshold=debug
log4j.appender.FILE.Append=${logfile.append}
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n
log4j.appender.stdlog=org.apache.log4j.ConsoleAppender
log4j.appender.stdlog.target=System.out
log4j.appender.stdlog.layout=org.apache.log4j.PatternLayout
log4j.appender.stdlog.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n
log4j.logger.org.apache.jena.arq.info=INFO
log4j.logger.org.apache.jena.arq.exec=INFO
log4j.logger.org.apache.jena.tdb.loader=INFO
log4j.logger.org.apache.jena=INFO
log4j.logger.org.apache.jena.riot=INFO
log4j.logger.TDB=INFO
log4j.logger.org.apache.jena.tdb=INFO
log4j.logger.org.apache.jena.tdb.transaction=INFO
log4j.logger.org.apache.jena.tdb.transaction.NodeTableTrans=ALL
log4j.logger.org.joseki=INFO
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment