Update forbidden subjects codes to allow ranges

parent c80f08a8
......@@ -13,6 +13,7 @@ import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
......@@ -28,9 +29,23 @@ import org.slf4j.LoggerFactory;
public class AgenceFrancePresse {
private static final Logger LOGGER = LoggerFactory.getLogger(AgenceFrancePresse.class);
private final Configuration config;
private Set<String> forbiddenSubjects;
/**
 * Stores the configuration and expands the configured forbidden subject codes.
 *
 * <p>Each entry returned by {@code config.getForbiddenSubjects()} is either a single numeric
 * code (for example {@code 15003000}) or a range written as {@code start-end} (for example
 * {@code 15000000-15999999}). Both bounds of a range are inclusive. Every code covered by an
 * entry is stored in {@code forbiddenSubjects} in its decimal string form.
 *
 * @param config the configuration providing the forbidden subject entries
 * @throws NumberFormatException if a bound of an entry is not a valid decimal number
 */
public AgenceFrancePresse(Configuration config) {
    this.config = config;
    this.forbiddenSubjects = new HashSet<>();
    for (String forbiddenSubject : config.getForbiddenSubjects()) {
        String[] bounds = forbiddenSubject.trim().split("-");
        long startSubj = Long.parseLong(bounds[0].trim());
        // An entry without an explicit end is a range of exactly one code.
        long endSubj = startSubj;
        if (bounds.length > 1) {
            endSubj = Long.parseLong(bounds[1].trim());
        }
        // The upper bound is inclusive: with an exclusive bound a single-code entry
        // (start == end) would add nothing, and a range would lose its last code.
        for (long code = startSubj; code <= endSubj; code++) {
            this.forbiddenSubjects.add(String.valueOf(code));
        }
    }
}
public void run(
......@@ -159,7 +174,12 @@ public class AgenceFrancePresse {
return;
}
if (!Collections.disjoint(article.getSubjects(), config.getForbiddenSubjects())) {
Set<String> subjects = new HashSet<>();
for (String subject : article.getSubjects()) {
long subjectNum = Long.parseLong(subject.substring(subject.lastIndexOf("/") + 1));
subjects.add(String.valueOf(subjectNum));
}
if (!Collections.disjoint(subjects, forbiddenSubjects)) {
// Ignore articles with certain subjects
AgenceFrancePresse.LOGGER.info(
"Skip article with forbidden subject: "
......
......@@ -18,11 +18,7 @@ forbiddenGenres:
- BioProfile
- PressRelease
forbiddenSubjects:
- http://cv.iptc.org/newscodes/subjectcode/15003000
- http://cv.iptc.org/newscodes/subjectcode/15006000
- http://cv.iptc.org/newscodes/subjectcode/15008000
- http://cv.iptc.org/newscodes/subjectcode/15014000
- http://cv.iptc.org/newscodes/subjectcode/15019000
- 15000000-15999999
# Brat Annotator
annotatorPath: ../news-annotations/src
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment