Commit fc5be870 authored by Thibault Ehrhart's avatar Thibault Ehrhart

Refactor to use a single configuration file (config.yml)

parent 3383d73b
......@@ -46,4 +46,5 @@ buildNumber.properties
###
data
!data/resource/iptc
*.log
props/config.yml
*.log
\ No newline at end of file
......@@ -78,5 +78,11 @@
<artifactId>json</artifactId>
<version>20200518</version>
</dependency>
<dependency>
<groupId>fr.eurecom.asrael</groupId>
<artifactId>commons</artifactId>
<version>1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
package fr.eurecom.asrael.adelannotator;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
......@@ -40,17 +41,21 @@ import org.slf4j.LoggerFactory;
public class AdelAnnotator {
private static final Logger LOGGER = LoggerFactory.getLogger(AdelAnnotator.class);
private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
private static final Pattern ADEL_URI_PATTERN =
Pattern.compile("((?: +)(?:[^ ]+)(?: +)(?:(?!\").)|^)<(.+)>", Pattern.MULTILINE);
private static final Pattern OPENING_HTML_TAG_PATTERN =
Pattern.compile("<(?:(?! ))(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>");
private static final Pattern CLOSING_HTML_TAG_PATTERN =
Pattern.compile("<\\/(?:(?! ))(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>");
private final Configuration config;
public AdelAnnotator(Configuration config) {
this.config = config;
}
public void run() {
try {
Files.list(Paths.get(AdelAnnotator.DUMP_PATH + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......
package fr.eurecom.asrael.adelannotator;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.IOException;
/** Command-line entry point that runs {@link AdelAnnotator}. */
public class Main {
/**
 * Loads the shared configuration with {@link Configuration#load()} and runs an
 * {@link AdelAnnotator} built from it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(final String... args) {
// NOTE(review): this no-argument call looks like the pre-refactor line kept by the diff view;
// AdelAnnotator only shows a constructor taking a Configuration - confirm before compiling.
new AdelAnnotator().run();
try {
new AdelAnnotator(Configuration.load()).run();
} catch (IOException ex) {
// Reading the configuration failed: the stack trace is printed and main returns normally.
ex.printStackTrace();
}
}
}
......@@ -10,16 +10,16 @@ mvn -U clean package
## Configuration
Before running, make sure to copy/rename the file `props/config.properties.default` into `props/config.properties` and open it to set up the properties:
Before running, make sure to copy/rename the file `props/default.yml` into `props/config.yml` and open it to set up the properties:
* **annotator.path** - path to the `src` folder of the [news-annotations](https://gitlab.eurecom.fr/asrael/news-annotations/) project.\
Example: `annotator.path=/path/to/news-annotations/src`
* **annotatorPath** - path to the `src` folder of the [news-annotations](https://gitlab.eurecom.fr/asrael/news-annotations/) project.\
Example: `annotatorPath: /path/to/news-annotations/src`
* **annotator.file** - name of the script file used to annotate articles.\
Example: `annotator.file=annotate_article.py`
* **annotatorFile** - name of the script file used to annotate articles.\
Example: `annotatorFile: annotate_article.py`
## Usage
```
java -Dlogfile.name=asrael-bratannotator.log -Dlogfile.append=true -jar target/BratAnnotator-1.0-SNAPSHOT.jar
mvn exec:java -Dlogfile.name=asrael-bratannotator.log -Dlogfile.append=true -pl brat-annotator
```
annotator.path=
annotator.file=annotate_article.py
......@@ -7,7 +7,22 @@ import com.pengyifan.brat.io.BratDocumentReader;
import fr.eurecom.asrael.commons.datatypes.Annotation;
import fr.eurecom.asrael.commons.datatypes.NewsArticle;
import fr.eurecom.asrael.commons.datatypes.WikidataEvent;
import fr.eurecom.asrael.commons.utils.Configuration;
import fr.eurecom.asrael.commons.utils.UUIDGenerator;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
......@@ -22,34 +37,17 @@ import org.apache.jena.riot.RDFFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.file.*;
import java.util.List;
import java.util.Properties;
import java.util.Set;
public class BratAnnotator {
private static final Logger LOGGER = LoggerFactory.getLogger(BratAnnotator.class);
private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
private static final String RESOURCES_PATH= "./data/resource/agencefrancepresse";
private String annotatorPath;
private String annotatorFile;
public BratAnnotator() {
try (InputStream input = new FileInputStream("props/config.properties")) {
Properties prop = new Properties();
prop.load(input);
annotatorPath = prop.getProperty("annotator.path");
annotatorFile = prop.getProperty("annotator.file");
} catch (IOException ex) {
ex.printStackTrace();
}
private final Configuration config;
public BratAnnotator(Configuration config) {
this.config = config;
}
public void run() {
try {
Files.list(Paths.get(BratAnnotator.DUMP_PATH + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......@@ -108,7 +106,7 @@ public class BratAnnotator {
finalModel.add(event.getModel());
finalModel.add(
ResourceFactory.createResource(
NewsArticle.BASE_URI
NewsArticle.BASE_URI
+ UUIDGenerator.generateUUID(identifier)),
ResourceFactory.createProperty(
"http://iptc.org/std/rNews/2011-10-07#about"),
......@@ -120,7 +118,7 @@ public class BratAnnotator {
try {
Files.createDirectories(
Paths.get(
DUMP_PATH
config.getDumpPath()
+ FileSystems.getDefault().getSeparator()
+ langDir.getFileName()));
} catch (IOException e) {
......@@ -166,8 +164,9 @@ public class BratAnnotator {
try {
// -a <text to annotate> -s <schema id> -w <wiki instance id>
ProcessBuilder pb =
new ProcessBuilder("python", annotatorFile, "-a", desc, "-s", "S0", "-w", "Q19882906");
pb.directory(new File(annotatorPath));
new ProcessBuilder(
"python", config.getAnnotatorFile(), "-a", desc, "-s", "S0", "-w", "Q19882906");
pb.directory(new File(config.getAnnotatorPath()));
pb.redirectErrorStream(true);
Process p = pb.start();
BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()));
......
package fr.eurecom.asrael.bratannotator;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.IOException;
/** Command-line entry point that runs {@link BratAnnotator}. */
public class Main {
/**
 * Loads the shared configuration with {@link Configuration#load()} and runs a
 * {@link BratAnnotator} built from it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(final String... args) {
// NOTE(review): this no-argument call looks like the pre-refactor line kept by the diff view;
// the refactored BratAnnotator constructor takes a Configuration - confirm before compiling.
new BratAnnotator().run();
try {
new BratAnnotator(Configuration.load()).run();
} catch (IOException ex) {
// Reading the configuration failed: the stack trace is printed and main returns normally.
ex.printStackTrace();
}
}
}
package fr.eurecom.asrael.classificationconverter;
import fr.eurecom.asrael.commons.datatypes.NewsArticle;
import fr.eurecom.asrael.commons.utils.Configuration;
import fr.eurecom.asrael.commons.utils.UUIDGenerator;
import fr.eurecom.asrael.newscollector.agencefrancepresse.AgenceFrancePresseMethods;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOUtils;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
......@@ -15,28 +33,20 @@ import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.nio.file.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
public class ClassificationConverter {
private static final Logger LOGGER = LoggerFactory.getLogger(ClassificationConverter.class);
private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
private static final String CLASSIFICATION_JSON_PATH =
"./data/resource/classification/out.pkl.json";
private static final String CLASSIFICATION_XML_PATH = "./data/resource/classification/xml";
private final Configuration config;
public ClassificationConverter(Configuration config) {
this.config = config;
}
public void run() {
final Model model = ModelFactory.createDefaultModel();
// Parse JSON
JSONObject data = null;
try (FileReader reader = new FileReader(String.valueOf(Paths.get(CLASSIFICATION_JSON_PATH)))) {
try (FileReader reader = new FileReader(String.valueOf(Paths.get(config.getClassificationJsonPath())))) {
data = new JSONObject(IOUtils.toString(reader));
} catch (JSONException e) {
LOGGER.error("Error when parsing schema classification json file: " + e.toString());
......@@ -58,7 +68,7 @@ public class ClassificationConverter {
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
try (InputStream in =
new FileInputStream(
Paths.get(CLASSIFICATION_XML_PATH + FileSystems.getDefault().getSeparator() + key)
Paths.get(config.getClassificationXmlPath() + FileSystems.getDefault().getSeparator() + key)
.toFile())) {
try {
article = AgenceFrancePresseMethods.parseArticle(inputFactory.createXMLStreamReader(in));
......@@ -90,7 +100,7 @@ public class ClassificationConverter {
}
final Path classificationDir =
Paths.get(DUMP_PATH + FileSystems.getDefault().getSeparator() + "classification");
Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator() + "classification");
// Make sure that the dump directory exists
try {
......
package fr.eurecom.asrael.classificationconverter;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.IOException;
/** Command-line entry point that runs {@link ClassificationConverter}. */
public class Main {
/**
 * Loads the shared configuration with {@link Configuration#load()} and runs a
 * {@link ClassificationConverter} built from it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(final String... args) {
// NOTE(review): this no-argument call looks like the pre-refactor line kept by the diff view;
// ClassificationConverter only shows a constructor taking a Configuration - confirm before
// compiling.
new ClassificationConverter().run();
try {
new ClassificationConverter(Configuration.load()).run();
} catch (IOException ex) {
// Reading the configuration failed: the stack trace is printed and main returns normally.
ex.printStackTrace();
}
}
}
package fr.eurecom.asrael.clustersconverter;
import fr.eurecom.asrael.commons.utils.Configuration;
import fr.eurecom.asrael.commons.utils.UUIDGenerator;
import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.rdf.model.Model;
......@@ -30,12 +31,15 @@ import java.text.SimpleDateFormat;
public class ClustersConverter {
private static final Logger LOGGER = LoggerFactory.getLogger(ClustersConverter.class);
private static final String CLUSTERS_PATH = "./data/resource/clustering/out";
private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
private final Configuration config;
public ClustersConverter(Configuration config) {
this.config = config;
}
public void run() {
try {
Files.list(Paths.get(CLUSTERS_PATH + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getClustersPath() + FileSystems.getDefault().getSeparator()))
.filter(file -> file.toFile().isDirectory())
.forEach(
clusteringLangDir -> {
......@@ -142,7 +146,7 @@ public class ClustersConverter {
// Write output
final String rdfFile = clusteringDate + ".clusters.ttl";
final Path outputDir =
Paths.get(DUMP_PATH + FileSystems.getDefault().getSeparator() + "clustering");
Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator() + "clustering");
Path outFile = Paths.get(outputDir.toString(), rdfFile);
// Make sure that the dump directory exists
......
package fr.eurecom.asrael.clustersconverter;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.IOException;
/** Command-line entry point that runs {@link ClustersConverter}. */
public class Main {
/**
 * Loads the shared configuration with {@link Configuration#load()} and runs a
 * {@link ClustersConverter} built from it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(final String... args) {
// NOTE(review): this no-argument call looks like the pre-refactor line kept by the diff view;
// ClustersConverter only shows a constructor taking a Configuration - confirm before compiling.
new ClustersConverter().run();
try {
new ClustersConverter(Configuration.load()).run();
} catch (IOException ex) {
// Reading the configuration failed: the stack trace is printed and main returns normally.
ex.printStackTrace();
}
}
}
......@@ -20,5 +20,11 @@
<type>pom</type>
<version>3.15.0</version>
</dependency>
<dependency>
<groupId>org.yaml</groupId>
<artifactId>snakeyaml</artifactId>
<version>1.26</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
package fr.eurecom.asrael.commons.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import java.util.stream.Collectors;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.representer.Representer;
/**
 * Settings shared by every module, populated by SnakeYAML from a single YAML file.
 *
 * <p>Each YAML key maps onto the bean property of the same name (for example {@code dumpPath}
 * is applied through {@link #setDumpPath(String)}). String properties that are absent from the
 * file stay {@code null}; the two "forbidden" sets default to an empty set so that callers can
 * test membership without a null check.
 */
public class Configuration {
  /** User-specific configuration file, resolved against the working directory. */
  private static final String CONFIG_FILE = "props/config.yml";

  /** File used by {@link #load()} when {@link #CONFIG_FILE} is not a readable regular file. */
  private static final String DEFAULT_CONFIG_FILE = "props/default.yml";

  // Common
  private String resourcesPath;
  private String dumpPath;

  // News Converter
  // Default to empty sets: a missing YAML key must not make the filters throw.
  private Set<String> forbiddenKeywords = java.util.Collections.emptySet();
  private Set<String> forbiddenGenres = java.util.Collections.emptySet();

  // Brat Annotator
  private String annotatorPath;
  private String annotatorFile;

  // Classification Converter
  private String classificationJsonPath;
  private String classificationXmlPath;

  // Clusters Converter
  private String clustersPath;

  /**
   * Loads {@code props/config.yml}, or {@code props/default.yml} when the former is missing, is
   * not a regular file, or cannot be read.
   *
   * @return the parsed configuration, never {@code null}
   * @throws IOException if the selected file cannot be opened or contains no YAML document
   */
  public static Configuration load() throws IOException {
    final File file = new File(CONFIG_FILE);
    // isFile() is already false for a path that does not exist.
    if (file.isFile() && file.canRead()) {
      return load(file.getPath());
    }
    return load(DEFAULT_CONFIG_FILE);
  }

  /**
   * Loads the configuration from the given YAML file.
   *
   * @param configFilePath path of the YAML file to read
   * @return the parsed configuration, never {@code null}
   * @throws IOException if the file cannot be opened or contains no YAML document
   */
  public static Configuration load(String configFilePath) throws IOException {
    try (final InputStream input = new FileInputStream(configFilePath)) {
      final Representer representer = new Representer();
      // Keys without a matching property are meant to be skipped instead of failing the load.
      representer.getPropertyUtils().setSkipMissingProperties(true);
      final Configuration config = new Yaml(representer).loadAs(input, Configuration.class);
      if (config == null) {
        // An empty document yields null; report it here rather than as a later NPE in a caller.
        throw new IOException("Configuration file is empty: " + configFilePath);
      }
      return config;
    }
  }

  /**
   * Returns a lower-cased copy of {@code values}, treating {@code null} as an empty set and
   * dropping {@code null} elements.
   *
   * <p>The default-locale {@code toLowerCase()} is kept on purpose: AgenceFrancePresse lower-cases
   * article keywords and genres the same way before comparing them with these sets.
   */
  private static Set<String> lowerCased(Set<String> values) {
    if (values == null) {
      return java.util.Collections.emptySet();
    }
    return values.stream()
        .filter(java.util.Objects::nonNull)
        .map(String::toLowerCase)
        .collect(Collectors.toSet());
  }

  public String getResourcesPath() {
    return resourcesPath;
  }

  public void setResourcesPath(String resourcesPath) {
    this.resourcesPath = resourcesPath;
  }

  public String getDumpPath() {
    return dumpPath;
  }

  public void setDumpPath(String dumpPath) {
    this.dumpPath = dumpPath;
  }

  /**
   * @return the forbidden keywords in lower case; empty (never {@code null}) when none are set
   */
  public Set<String> getForbiddenKeywords() {
    return forbiddenKeywords;
  }

  /**
   * Stores a lower-cased copy of the given keywords.
   *
   * @param forbiddenKeywords keywords to reject; {@code null} is treated as "none"
   */
  public void setForbiddenKeywords(Set<String> forbiddenKeywords) {
    this.forbiddenKeywords = lowerCased(forbiddenKeywords);
  }

  /**
   * @return the forbidden genres in lower case; empty (never {@code null}) when none are set
   */
  public Set<String> getForbiddenGenres() {
    return forbiddenGenres;
  }

  /**
   * Stores a lower-cased copy of the given genres.
   *
   * @param forbiddenGenres genres to reject; {@code null} is treated as "none"
   */
  public void setForbiddenGenres(Set<String> forbiddenGenres) {
    this.forbiddenGenres = lowerCased(forbiddenGenres);
  }

  public String getAnnotatorPath() {
    return annotatorPath;
  }

  public void setAnnotatorPath(String annotatorPath) {
    this.annotatorPath = annotatorPath;
  }

  public String getAnnotatorFile() {
    return annotatorFile;
  }

  public void setAnnotatorFile(String annotatorFile) {
    this.annotatorFile = annotatorFile;
  }

  public String getClassificationJsonPath() {
    return classificationJsonPath;
  }

  public void setClassificationJsonPath(String classificationJsonPath) {
    this.classificationJsonPath = classificationJsonPath;
  }

  public String getClassificationXmlPath() {
    return classificationXmlPath;
  }

  public void setClassificationXmlPath(String classificationXmlPath) {
    this.classificationXmlPath = classificationXmlPath;
  }

  public String getClustersPath() {
    return clustersPath;
  }

  public void setClustersPath(String clustersPath) {
    this.clustersPath = clustersPath;
  }

  @Override
  public String toString() {
    return "Configuration{"
        + "resourcesPath='"
        + resourcesPath
        + '\''
        + ", dumpPath='"
        + dumpPath
        + '\''
        + ", forbiddenKeywords="
        + forbiddenKeywords
        + ", forbiddenGenres="
        + forbiddenGenres
        + ", annotatorPath='"
        + annotatorPath
        + '\''
        + ", annotatorFile='"
        + annotatorFile
        + '\''
        + ", classificationJsonPath='"
        + classificationJsonPath
        + '\''
        + ", classificationXmlPath='"
        + classificationXmlPath
        + '\''
        + ", clustersPath='"
        + clustersPath
        + '\''
        + '}';
  }
}
package fr.eurecom.asrael.newscollector;
import fr.eurecom.asrael.commons.utils.Configuration;
import fr.eurecom.asrael.newscollector.agencefrancepresse.AgenceFrancePresse;
import java.io.IOException;
/** Command-line entry point that runs {@link AgenceFrancePresse}. */
public final class Main {
/**
 * Loads the shared configuration with {@link Configuration#load()} and runs an
 * {@link AgenceFrancePresse} collector built from it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(final String... args) {
// NOTE(review): this no-argument call looks like the pre-refactor line kept by the diff view;
// AgenceFrancePresse only shows a constructor taking a Configuration - confirm before compiling.
new AgenceFrancePresse().run();
try {
new AgenceFrancePresse(Configuration.load()).run();
} catch (IOException ex) {
// Reading the configuration failed: the stack trace is printed and main returns normally.
ex.printStackTrace();
}
}
}
......@@ -2,6 +2,7 @@ package fr.eurecom.asrael.newscollector.agencefrancepresse;
import fr.eurecom.asrael.commons.datatypes.ImageObject;
import fr.eurecom.asrael.commons.datatypes.NewsArticle;
import fr.eurecom.asrael.commons.utils.Configuration;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
......@@ -12,9 +13,10 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.apache.jena.rdf.model.Model;
......@@ -26,15 +28,16 @@ import org.slf4j.LoggerFactory;
public class AgenceFrancePresse {
private static final Logger LOGGER = LoggerFactory.getLogger(AgenceFrancePresse.class);
private static final String RESOURCES_PATH = "./data/resource/agencefrancepresse";
private static final String DUMP_PATH = "./data/dump/agencefrancepresse";
private final Configuration config;
public AgenceFrancePresse(Configuration config) {
this.config = config;
}
public void run() {
try {
// For each language dir (eg. ENG, FRA, GER)
Files.list(
Paths.get(
AgenceFrancePresse.RESOURCES_PATH + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getResourcesPath() + FileSystems.getDefault().getSeparator()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......@@ -78,7 +81,7 @@ public class AgenceFrancePresse {
private void processDir(Path dayDir, Path monthDir, Path yearDir, Path langDir) {
final Path turtleFile =
Paths.get(
DUMP_PATH,
config.getDumpPath(),
langDir.getFileName().toString(),
yearDir.getFileName()
+ "_"
......@@ -112,18 +115,29 @@ public class AgenceFrancePresse {
AgenceFrancePresseMethods.parseArticle(
inputFactory.createXMLStreamReader(in));
if (article != null) {
if (!Collections.disjoint(
article.getKeywords(),
Arrays.asList("Agenda", "Advisory", "COMMUNIQUÉ-BUSINESS-WIRE"))) {
Set<String> lowerCaseKeywords =
article.getKeywords().stream()
.map(String::toLowerCase)
.collect(Collectors.toSet());
if (!Collections.disjoint(lowerCaseKeywords, config.getForbiddenKeywords())) {
// Ignore articles with certain keywords
System.out.println(
"Skip article with Agenda/Advisory/Business Wire: " + article.getId());
"Skip article with forbidden keyword: "
+ article.getId()
+ " (keywords: "
+ String.join(", ", article.getKeywords())
+ ")");
return;
}
if (article.getGenre().equals("TextProgram")) {
if (config.getForbiddenGenres().contains(article.getGenre().toLowerCase())) {
// Ignore articles with certain genres
System.out.println(
"Skip article with genre TextProgram: " + article.getId());
"Skip article with forbidden genre: "
+ article.getId()
+ " (genre: "
+ article.getGenre()
+ ")");
return;
}
......@@ -152,7 +166,10 @@ public class AgenceFrancePresse {
// Make sure that the dump directory exists
try {
Files.createDirectories(
Paths.get(DUMP_PATH + FileSystems.getDefault().getSeparator() + langDir.getFileName()));
Paths.get(
config.getDumpPath()
+ FileSystems.getDefault().getSeparator()
+ langDir.getFileName()));
} catch (IOException e) {
AgenceFrancePresse.LOGGER.error(e.toString());
}
......
# Commons
resourcesPath: ./data/resource/agencefrancepresse
dumpPath: ./data/dump/agencefrancepresse
# News Converter
forbiddenKeywords:
- Agenda
- Advisory
- COMMUNIQUÉ-BUSINESS-WIRE
forbiddenGenres:
- TextProgram
# Brat Annotator
annotatorPath: ../news-annotations/src
annotatorFile: annotate_article.py
# Classification Converter
classificationJsonPath: ./data/resource/classification/out.pkl.json
classificationXmlPath: ./data/resource/classification/xml
# Clusters Converter
clustersPath: ./data/resource/clustering/out
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment