Commit 4094c856 authored by Thibault Ehrhart

Add datasetName property configuration

parent 15d4520f
......@@ -23,10 +23,11 @@ Copy the file [props/default.yml](props/default.yml) into `props/config.yml` and
| Property | Module | Description | Default Value |
|---|---|---|---|
| `resourcesPath` | all | Path to the resources directory. | `./data/resource/agencefrancepresse` |
| `dumpPath` | all | Path to the dump directory. | `./data/dump/agencefrancepresse` |
| `resourcesPath` | all | Path to the resources directory. | `./data/resource` |
| `dumpPath` | all | Path to the dump directory. | `./data/dump` |
| `annotatorPath` | _brat-annotator_ | Path to the src folder of the news-annotations project. | `/path/to/news-annotations/src` |
| `annotatorFile` | _brat-annotator_ | Name of the script file used to annotate articles. | `annotate_article.py` |
| `datasetName` | _news-collector_ | Name of the dataset directory. Must exist and be relative to `resourcesPath`. Also used for generating the output in `dumpPath`. | `agencefrancepresse` |
| `forbiddenKeywords` | _news-collector_ | List of keywords used to ignore certain news. | see [default.yml](props/default.yml) |
| `forbiddenGenres` | _news-collector_ | List of genres used to ignore certain news. | see [default.yml](props/default.yml) |
| `classificationJsonPath` | _classification-converter_ | Path to the JSON output file generated during classification. | `./data/resource/classification/out.pkl.json` |
......
......@@ -55,7 +55,7 @@ public class AdelAnnotator {
public void run() {
try {
Files.list(Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getDumpPath(), config.getDatasetName()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......
......@@ -47,7 +47,7 @@ public class BratAnnotator {
public void run() {
try {
Files.list(Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getDumpPath(), config.getDatasetName()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......@@ -118,9 +118,9 @@ public class BratAnnotator {
try {
Files.createDirectories(
Paths.get(
config.getDumpPath()
+ FileSystems.getDefault().getSeparator()
+ langDir.getFileName()));
config.getDumpPath(),
config.getDatasetName(),
langDir.getFileName().toString()));
} catch (IOException e) {
BratAnnotator.LOGGER.error(e.toString());
}
......
......@@ -46,7 +46,8 @@ public class ClassificationConverter {
// Parse JSON
JSONObject data = null;
try (FileReader reader = new FileReader(String.valueOf(Paths.get(config.getClassificationJsonPath())))) {
try (FileReader reader =
new FileReader(String.valueOf(Paths.get(config.getClassificationJsonPath())))) {
data = new JSONObject(IOUtils.toString(reader));
} catch (JSONException e) {
LOGGER.error("Error when parsing schema classification json file: " + e.toString());
......@@ -67,9 +68,7 @@ public class ClassificationConverter {
// Parse the news article
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
try (InputStream in =
new FileInputStream(
Paths.get(config.getClassificationXmlPath() + FileSystems.getDefault().getSeparator() + key)
.toFile())) {
new FileInputStream(Paths.get(config.getClassificationXmlPath(), key).toFile())) {
try {
article = AgenceFrancePresseMethods.parseArticle(inputFactory.createXMLStreamReader(in));
} catch (XMLStreamException e) {
......@@ -99,8 +98,7 @@ public class ClassificationConverter {
}
}
final Path classificationDir =
Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator() + "classification");
final Path classificationDir = Paths.get(config.getDumpPath(), "classification");
// Make sure that the dump directory exists
try {
......
......@@ -39,7 +39,7 @@ public class ClustersConverter {
public void run() {
try {
Files.list(Paths.get(config.getClustersPath() + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getClustersPath()))
.filter(file -> file.toFile().isDirectory())
.forEach(
clusteringLangDir -> {
......@@ -145,8 +145,7 @@ public class ClustersConverter {
// Write output
final String rdfFile = clusteringDate + ".clusters.ttl";
final Path outputDir =
Paths.get(config.getDumpPath() + FileSystems.getDefault().getSeparator() + "clustering");
final Path outputDir = Paths.get(config.getDumpPath(), "clustering");
Path outFile = Paths.get(outputDir.toString(), rdfFile);
// Make sure that the dump directory exists
......
......@@ -15,6 +15,7 @@ public class Configuration {
private String dumpPath;
// News Converter
private String datasetName;
private Set<String> forbiddenKeywords;
private Set<String> forbiddenGenres;
......@@ -63,6 +64,14 @@ public class Configuration {
this.dumpPath = dumpPath;
}
/**
 * Returns the configured dataset name.
 *
 * <p>According to the project README, this is the name of the dataset directory, expected to
 * exist relative to {@code resourcesPath}; it is also used when generating output under
 * {@code dumpPath}.
 *
 * @return the current value of the {@code datasetName} property
 */
public String getDatasetName() {
  return this.datasetName;
}
/**
 * Sets the dataset name.
 *
 * <p>According to the project README, this is the name of the dataset directory, expected to
 * exist relative to {@code resourcesPath}; it is also used when generating output under
 * {@code dumpPath}. No validation is performed here.
 *
 * @param datasetName the dataset directory name to store
 */
public void setDatasetName(String datasetName) {
this.datasetName = datasetName;
}
public Set<String> getForbiddenKeywords() {
return forbiddenKeywords;
}
......@@ -130,6 +139,9 @@ public class Configuration {
+ ", dumpPath='"
+ dumpPath
+ '\''
+ ", datasetName='"
+ datasetName
+ '\''
+ ", forbiddenKeywords="
+ forbiddenKeywords
+ ", forbiddenGenres="
......
......@@ -37,7 +37,7 @@ public class AgenceFrancePresse {
public void run() {
try {
// For each language dir (eg. ENG, FRA, GER)
Files.list(Paths.get(config.getResourcesPath() + FileSystems.getDefault().getSeparator()))
Files.list(Paths.get(config.getResourcesPath(), config.getDatasetName()))
.filter(file -> file.toFile().isDirectory())
.forEach(
langDir -> {
......@@ -82,6 +82,7 @@ public class AgenceFrancePresse {
final Path turtleFile =
Paths.get(
config.getDumpPath(),
config.getDatasetName(),
langDir.getFileName().toString(),
yearDir.getFileName()
+ "_"
......@@ -167,9 +168,7 @@ public class AgenceFrancePresse {
try {
Files.createDirectories(
Paths.get(
config.getDumpPath()
+ FileSystems.getDefault().getSeparator()
+ langDir.getFileName()));
config.getDumpPath(), config.getDatasetName(), langDir.getFileName().toString()));
} catch (IOException e) {
AgenceFrancePresse.LOGGER.error(e.toString());
}
......
# Commons
resourcesPath: ./data/resource/agencefrancepresse
dumpPath: ./data/dump/agencefrancepresse
resourcesPath: ./data/resource
dumpPath: ./data/dump
# News Converter
datasetName: agencefrancepresse
forbiddenKeywords:
- Agenda
- Advisory
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment