Commit f4cf9a55 authored by Thibault Ehrhart

Add clusters-converter module

parent 18bd6478
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor of the clusters-converter module (child of the news-kb parent project). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>news-kb</artifactId>
<groupId>fr.eurecom</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>clusters-converter</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<!-- Shade plugin: runs the "shade" goal during the package phase. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<filters>
<!-- Applies to every artifact: leave out the signature files found in META-INF. -->
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Resource transformers applied while shading; the manifest transformer declares the main class. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>fr.eurecom.asrael.clustersconverter.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<!-- Exec plugin: configured with the same main class as the shaded jar manifest. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<mainClass>fr.eurecom.asrael.clustersconverter.Main</mainClass>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- Apache Jena, declared as a pom-type dependency. -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>apache-jena-libs</artifactId>
<type>pom</type>
<version>3.15.0</version>
</dependency>
<!-- Logging facade used by the converter. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
<scope>compile</scope>
</dependency>
<!-- NOTE(review): jena-core is probably already provided transitively by apache-jena-libs; confirm before removing. -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-core</artifactId>
<version>3.15.0</version>
<scope>compile</scope>
</dependency>
<!-- Project-internal commons module (the converter imports UUIDGenerator from it). -->
<dependency>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>commons</artifactId>
<version>1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
</dependencies>
<packaging>jar</packaging>
</project>
package fr.eurecom.asrael.clustersconverter;
import fr.eurecom.asrael.commons.utils.UUIDGenerator;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.stream.Stream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdf.model.ResourceFactory;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFFormat;
import org.apache.jena.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Converts clustering results stored as XML {@code .clust} files into RDF dumps written as Turtle.
 *
 * <p>Every sub-directory of {@code CLUSTERS_PATH} is scanned for files whose name ends with
 * {@code .clust}. For each such file one Turtle file is written to {@code DUMP_PATH/clustering};
 * its name is derived from the last-modified time of the input file.
 */
public class ClustersConverter {
  private static final Logger LOGGER = LoggerFactory.getLogger(ClustersConverter.class);
  private static final String CLUSTERS_PATH = "./data/resource/clustering/out";
  private static final String DUMP_PATH = "./data/dump/agencefrancepresse";

  private static final String ASRAEL_NS = "http://asrael.eurecom.fr/asrael#";
  private static final String SCHEMA_NS = "http://schema.org/";
  private static final String CLUSTER_BASE_URI = "http://asrael.eurecom.fr/cluster/";
  private static final String NEWS_BASE_URI = "http://asrael.eurecom.fr/news/";

  /**
   * Converts every {@code .clust} file found in the sub-directories of {@code CLUSTERS_PATH}.
   *
   * <p>I/O and parsing problems are logged and the offending directory or file is skipped; no
   * checked exception is propagated to the caller.
   */
  public void run() {
    // Files.list keeps a directory handle open until the stream is closed.
    try (Stream<Path> entries = Files.list(Paths.get(CLUSTERS_PATH))) {
      entries.filter(Files::isDirectory).forEach(this::convertDirectory);
    } catch (IOException e) {
      LOGGER.error("Unable to list clustering directories in {}", CLUSTERS_PATH, e);
    }
  }

  /** Converts every file of {@code clusteringDir} whose name ends with {@code .clust}. */
  private void convertDirectory(final Path clusteringDir) {
    try (Stream<Path> files = Files.list(clusteringDir)) {
      files
          .filter(file -> file.getFileName().toString().endsWith(".clust"))
          .forEach(this::convertFile);
    } catch (IOException e) {
      LOGGER.error("Unable to list cluster files in {}", clusteringDir, e);
    }
  }

  /**
   * Converts a single cluster file and writes the resulting Turtle dump.
   *
   * <p>The file is skipped (nothing is written) when its attributes cannot be read or when it
   * cannot be parsed, so that an existing dump is never replaced by an empty one.
   */
  private void convertFile(final Path file) {
    // The last-modified time is used for the cluster identifiers and the output file name.
    final FileTime fileTime;
    try {
      fileTime = Files.readAttributes(file, BasicFileAttributes.class).lastModifiedTime();
    } catch (IOException e) {
      LOGGER.error("Unable to read the attributes of {}, skipping it", file, e);
      return;
    }

    // NOTE(review): 'Z' is a quoted literal while the date itself is formatted in the JVM default
    // time zone. Left unchanged because this value feeds the cluster URIs and the output file
    // name; confirm whether UTC was intended before changing it.
    final DateFormat df = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
    final String clusteringDate = df.format(fileTime.toMillis());

    final Model model;
    try {
      model = buildModel(file, fileTime, clusteringDate);
    } catch (ParserConfigurationException | SAXException | IOException e) {
      LOGGER.error("Unable to parse cluster file {}, no output written", file, e);
      return;
    }
    writeModel(model, clusteringDate);
  }

  /**
   * Parses {@code file} as XML and builds the RDF model describing all its {@code cluster}
   * elements.
   *
   * @throws ParserConfigurationException if no XML parser can be created
   * @throws SAXException if the file is not well-formed XML
   * @throws IOException if the file cannot be read
   */
  private Model buildModel(final Path file, final FileTime fileTime, final String clusteringDate)
      throws ParserConfigurationException, SAXException, IOException {
    final Model model = ModelFactory.createDefaultModel();
    // Prefixes used by the Turtle serialization of the final model.
    model.setNsPrefix("asrael", ASRAEL_NS);
    model.setNsPrefix("schema", SCHEMA_NS);

    final DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    final Document document = builder.parse(file.toFile());
    final NodeList clusters = document.getElementsByTagName("cluster");
    for (int i = 0; i < clusters.getLength(); i++) {
      final Node node = clusters.item(i);
      if (node.getNodeType() == Node.ELEMENT_NODE) {
        addCluster(model, (Element) node, fileTime, clusteringDate);
      }
    }
    return model;
  }

  /** Adds to {@code model} the statements describing one {@code cluster} element and its docs. */
  private void addCluster(
      final Model model,
      final Element cluster,
      final FileTime fileTime,
      final String clusteringDate) {
    final String firstEventIdentifier = cluster.getAttribute("first_evt_identifier");
    // The cluster URI depends on both the first event identifier and the clustering date.
    final Resource clusterEntity =
        ResourceFactory.createResource(
            CLUSTER_BASE_URI
                + UUIDGenerator.generateUUID(firstEventIdentifier + "|" + clusteringDate));

    // RDF Type
    model.add(clusterEntity, RDF.type, ResourceFactory.createResource(SCHEMA_NS + "CreativeWork"));
    // Date
    model.add(
        clusterEntity,
        ResourceFactory.createProperty(SCHEMA_NS + "dateCreated"),
        ResourceFactory.createTypedLiteral(fileTime.toString(), XSDDatatype.XSDdateTime));
    // Lead
    model.add(
        clusterEntity,
        ResourceFactory.createProperty(ASRAEL_NS + "lead"),
        ResourceFactory.createResource(
            NEWS_BASE_URI + UUIDGenerator.generateUUID(firstEventIdentifier)));

    final NodeList docs = cluster.getElementsByTagName("doc");
    for (int j = 0; j < docs.getLength(); j++) {
      final Node docNode = docs.item(j);
      if (docNode.getNodeType() == Node.ELEMENT_NODE) {
        final String docIdentifier = ((Element) docNode).getAttribute("identifier");
        // Is Part Of
        model.add(
            ResourceFactory.createResource(
                NEWS_BASE_URI + UUIDGenerator.generateUUID(docIdentifier)),
            ResourceFactory.createProperty(SCHEMA_NS + "isPartOf"),
            clusterEntity);
      }
    }
  }

  /**
   * Serializes {@code model} as Turtle into {@code DUMP_PATH/clustering/<clusteringDate>.clusters.ttl},
   * creating the output directory when needed and replacing any existing file of the same name.
   */
  private void writeModel(final Model model, final String clusteringDate) {
    final Path outputDir = Paths.get(DUMP_PATH, "clustering");
    final Path outFile = outputDir.resolve(clusteringDate + ".clusters.ttl");

    // Make sure that the dump directory exists
    try {
      Files.createDirectories(outputDir);
    } catch (IOException e) {
      LOGGER.error("Unable to create output directory {}", outputDir, e);
      return;
    }

    LOGGER.info("Writing annotations to file {}", outFile);
    try (OutputStream outRdf =
        Files.newOutputStream(
            outFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
      RDFDataMgr.write(outRdf, model, RDFFormat.TURTLE_PRETTY);
    } catch (IOException e) {
      LOGGER.error("Issue to write annotated file", e);
    }
  }
}
package fr.eurecom.asrael.clustersconverter;
/** Command-line entry point of the clusters-converter module. */
public class Main {
  /**
   * Creates a {@link ClustersConverter} and runs it once.
   *
   * @param args command-line arguments; not read by this method
   */
  public static void main(final String... args) {
    final ClustersConverter converter = new ClustersConverter();
    converter.run();
  }
}
# Root logger: INFO level, sent to both the console appender (stdlog) and the file appender (FILE).
log4j.rootLogger=INFO,stdlog,FILE
# File appender: appends to the log file named below, flushes after each event and
# accepts events from the debug level upwards.
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File=fr.eurecom.asrael.clustersconverter.ClustersConverter.log
log4j.appender.FILE.ImmediateFlush=true
log4j.appender.FILE.Threshold=debug
log4j.appender.FILE.Append=true
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n
# Console appender: writes to System.out using the same pattern as the file appender.
log4j.appender.stdlog=org.apache.log4j.ConsoleAppender
log4j.appender.stdlog.target=System.out
log4j.appender.stdlog.layout=org.apache.log4j.PatternLayout
log4j.appender.stdlog.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n
# Per-logger levels for the Jena, TDB and Joseki loggers.
log4j.logger.org.apache.jena.arq.info=INFO
log4j.logger.org.apache.jena.arq.exec=INFO
log4j.logger.org.apache.jena.tdb.loader=INFO
log4j.logger.org.apache.jena=INFO
log4j.logger.org.apache.jena.riot=INFO
log4j.logger.TDB=INFO
log4j.logger.org.apache.jena.tdb=INFO
log4j.logger.org.apache.jena.tdb.transaction=INFO
# NOTE(review): this is the only logger set to ALL; confirm that this verbosity is intended.
log4j.logger.org.apache.jena.tdb.transaction.NodeTableTrans=ALL
log4j.logger.org.joseki=INFO
......@@ -20,6 +20,7 @@
<module>adel-annotator</module>
<module>brat-annotator</module>
<module>classification-converter</module>
<module>clusters-converter</module>
<module>news-collector</module>
</modules>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment