Commit af133c1b authored by Thibault Ehrhart

Add classification converter module

parent d1f23217
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Module POM for the classification-converter artifact. It inherits from the
     fr.eurecom:news-kb parent POM and is packaged as a jar. -->
<parent>
<artifactId>news-kb</artifactId>
<groupId>fr.eurecom</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>classification-converter</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<!-- Shade plugin: bound to the package phase, it bundles the module and its
     dependencies into a single jar. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<filters>
<!-- Strip jar signature files from every bundled artifact; presumably because the
     signatures no longer match once jar contents are merged. -->
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Merges META-INF/services files coming from the different dependencies. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<!-- Writes the Main-Class manifest entry so the shaded jar can be started with java -jar. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>fr.eurecom.asrael.classificationconverter.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<!-- Exec plugin: configures the class started by the exec:java goal. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<mainClass>fr.eurecom.asrael.classificationconverter.Main</mainClass>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- Apache Jena, used to build and serialize the RDF model. -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>apache-jena-libs</artifactId>
<type>pom</type>
<version>3.15.0</version>
</dependency>
<!-- SLF4J binding that routes log calls to log4j 1.x. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
</dependency>
<!-- NOTE(review): jena-core is declared explicitly at the same version as
     apache-jena-libs above; it is probably already pulled in transitively. Confirm
     before removing. -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-core</artifactId>
<version>3.15.0</version>
<scope>compile</scope>
</dependency>
<!-- Sibling modules of the same project (NewsArticle, UUIDGenerator, AFP parser). -->
<dependency>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>commons</artifactId>
<version>1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>news-collector</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<!-- org.json, used to read the classification JSON file. -->
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20200518</version>
</dependency>
</dependencies>
<packaging>jar</packaging>
</project>
package fr.eurecom.asrael.classificationconverter;
import fr.eurecom.asrael.commons.datatypes.NewsArticle;
import fr.eurecom.asrael.commons.utils.UUIDGenerator;
import fr.eurecom.asrael.newscollector.agencefrancepresse.AgenceFrancePresseMethods;
import org.apache.commons.io.IOUtils;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.ResourceFactory;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFFormat;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.nio.file.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.TimeZone;
/**
 * Converts article classifications stored in a JSON file into RDF triples.
 *
 * <p>The JSON file at {@code CLASSIFICATION_JSON_PATH} is read as a single object. Each key is
 * used as the name of an article XML file under {@code CLASSIFICATION_XML_PATH}; when the value
 * associated with a key is a JSON array, every element of that array becomes one
 * {@code http://schema.org/category} triple attached to the article's event resource. The
 * resulting model is written as N-Triples into the {@code classification} sub-directory of
 * {@code DUMP_PATH}.
 */
public class ClassificationConverter {
  private static final Logger LOGGER = LoggerFactory.getLogger(ClassificationConverter.class);
  private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
  private static final String CLASSIFICATION_JSON_PATH =
      "./data/resource/classification/out.pkl.json";
  private static final String CLASSIFICATION_XML_PATH = "./data/resource/classification/xml";

  /**
   * Runs the whole conversion: load the JSON classifications, parse the matching articles, build
   * the RDF model and dump it to disk.
   *
   * <p>Returns without writing anything when the JSON file cannot be read or parsed. Articles
   * that cannot be read or parsed are logged and skipped.
   */
  public void run() {
    final JSONObject data = loadClassifications();
    if (data == null) {
      return;
    }

    final Model model = ModelFactory.createDefaultModel();
    // One factory is enough for all articles; it was previously re-created per key.
    // NOTE(review): the factory is used with its default settings (DTD / external entity
    // handling untouched). Harden it if the XML files can come from an untrusted source.
    final XMLInputFactory inputFactory = XMLInputFactory.newInstance();

    for (final String key : data.keySet()) {
      final NewsArticle article = parseArticle(inputFactory, key);
      if (article == null) {
        LOGGER.error("NewsArticle is null for key: " + key);
        continue;
      }
      // optJSONArray yields null when the value is not a JSON array; such entries are ignored.
      final JSONArray schemas = data.optJSONArray(key);
      if (schemas != null) {
        addCategories(model, article, schemas);
      }
    }

    writeModel(model);
  }

  /**
   * Reads and parses the classification JSON file.
   *
   * @return the parsed JSON object, or {@code null} when the file cannot be read or is not valid
   *     JSON (the failure is logged)
   */
  private JSONObject loadClassifications() {
    final Path jsonFile = Paths.get(CLASSIFICATION_JSON_PATH);
    // Files.newBufferedReader decodes as UTF-8, unlike FileReader which depends on the
    // platform default charset.
    try (Reader reader = Files.newBufferedReader(jsonFile)) {
      return new JSONObject(IOUtils.toString(reader));
    } catch (JSONException e) {
      LOGGER.error("Error when parsing schema classification json file: {}", e.toString(), e);
    } catch (IOException e) {
      LOGGER.error("Unable to read schema classification json file {}", jsonFile, e);
    }
    return null;
  }

  /**
   * Parses the article XML file named {@code key} under {@code CLASSIFICATION_XML_PATH}.
   *
   * @param inputFactory factory used to create the XML stream reader
   * @param key file name of the article, taken from the classification JSON
   * @return the value returned by the AFP parser, or {@code null} when the file cannot be read or
   *     parsed (the failure is logged)
   */
  private NewsArticle parseArticle(final XMLInputFactory inputFactory, final String key) {
    final Path xmlFile = Paths.get(CLASSIFICATION_XML_PATH, key);
    try (InputStream in = Files.newInputStream(xmlFile)) {
      return AgenceFrancePresseMethods.parseArticle(inputFactory.createXMLStreamReader(in));
    } catch (XMLStreamException e) {
      LOGGER.error("Unable to parse news article file {}", xmlFile, e);
    } catch (IOException e) {
      LOGGER.error("Unable to read news article file {}", xmlFile, e);
    }
    return null;
  }

  /**
   * Adds one {@code schema:category} triple to {@code model} for every element of
   * {@code schemas}.
   *
   * @param model model receiving the triples
   * @param article article whose id is used to build the subject URI
   * @param schemas category names; each element is read with {@code getString}
   */
  private void addCategories(
      final Model model, final NewsArticle article, final JSONArray schemas) {
    for (int i = 0; i < schemas.length(); i++) {
      model.add(
          ResourceFactory.createResource(
              "http://asrael.eurecom.fr/event/" + UUIDGenerator.generateUUID(article.getId())),
          ResourceFactory.createProperty("http://schema.org/category"),
          ResourceFactory.createResource(
              "http://asrael.eurecom.fr/category/" + schemas.getString(i)));
    }
  }

  /**
   * Writes {@code model} as N-Triples into a timestamped file of the classification dump
   * directory, creating the directory when needed. Failures are logged, not thrown.
   *
   * @param model model to serialize
   */
  private void writeModel(final Model model) {
    final Path classificationDir = Paths.get(DUMP_PATH, "classification");
    // Make sure that the dump directory exists
    try {
      Files.createDirectories(classificationDir);
    } catch (IOException e) {
      // Without the directory the write below cannot succeed, so stop here.
      LOGGER.error("Unable to create dump directory {}", classificationDir, e);
      return;
    }

    // The pattern ends with a literal 'Z' (UTC designator), so the timestamp must actually be
    // formatted in UTC rather than in the JVM default time zone.
    final DateFormat df = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
    df.setTimeZone(TimeZone.getTimeZone("UTC"));
    final String rdfFile = df.format(new Date()) + ".class.nt";
    LOGGER.info("Writing classifications to file {}", rdfFile);

    // Write the model to an N-Triples file
    final Path outFile = classificationDir.resolve(rdfFile);
    try (OutputStream outRdf =
        Files.newOutputStream(
            outFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
      RDFDataMgr.write(outRdf, model, RDFFormat.NTRIPLES);
    } catch (final IOException e) {
      LOGGER.error("Issue to write classifications to file", e);
    }
  }
}
package fr.eurecom.asrael.classificationconverter;
/** Command-line entry point of the classification converter module. */
public class Main {
  /**
   * Creates a {@link ClassificationConverter} and runs it once.
   *
   * @param args command-line arguments; not used
   */
  public static void main(final String... args) {
    final ClassificationConverter converter = new ClassificationConverter();
    converter.run();
  }
}
# Root logger: events at INFO and above are sent to the console appender ("stdlog")
# and to the file appender ("FILE").
log4j.rootLogger=INFO,stdlog,FILE

# File appender: appends to ClassificationConverter.log (a relative path) and flushes
# after every event. The appender itself accepts events down to DEBUG; what actually
# reaches it is decided by the logger levels in this file.
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File=ClassificationConverter.log
log4j.appender.FILE.ImmediateFlush=true
log4j.appender.FILE.Threshold=debug
log4j.appender.FILE.Append=true
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp, level padded to 5 chars, last component of the logger name padded
# to 25 chars, "::", message, line separator.
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n

# Console appender: writes to System.out with the same pattern as the file appender.
log4j.appender.stdlog=org.apache.log4j.ConsoleAppender
log4j.appender.stdlog.target=System.out
log4j.appender.stdlog.layout=org.apache.log4j.PatternLayout
log4j.appender.stdlog.layout.ConversionPattern=%d{yyyy/MM/dd-HH:mm:ss} %-5p %-25c{1} :: %m%n

# Per-logger levels for Apache Jena components.
log4j.logger.org.apache.jena.arq.info=INFO
log4j.logger.org.apache.jena.arq.exec=INFO
log4j.logger.org.apache.jena.tdb.loader=INFO
log4j.logger.org.apache.jena=INFO
log4j.logger.org.apache.jena.riot=INFO
log4j.logger.TDB=INFO
log4j.logger.org.apache.jena.tdb=INFO
log4j.logger.org.apache.jena.tdb.transaction=INFO
# NOTE(review): the only logger set more verbose than INFO; confirm ALL is intentional.
log4j.logger.org.apache.jena.tdb.transaction.NodeTableTrans=ALL
# NOTE(review): Joseki logger; presumably carried over from a Jena logging template.
log4j.logger.org.joseki=INFO
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>fr.eurecom</groupId>
<artifactId>news-kb</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<groupId>fr.eurecom</groupId>
<artifactId>news-kb</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<modules>
<module>Commons</module>
<module>adel-annotator</module>
<module>brat-annotator</module>
<module>news-collector</module>
</modules>
<modules>
<module>Commons</module>
<module>adel-annotator</module>
<module>brat-annotator</module>
<module>classification-converter</module>
<module>news-collector</module>
</modules>
</project>
\ No newline at end of file
</project>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment