Commit 42a0c05a authored by Bertrand Goupil

Add NewsML revisionDate in Json response

parent d3a81554
...@@ -7,19 +7,21 @@ public class Memory { ...@@ -7,19 +7,21 @@ public class Memory {
public ConcurrentHashMap<String, String> parsedTexts; public ConcurrentHashMap<String, String> parsedTexts;
public ConcurrentHashMap<String, String> docTexts; public ConcurrentHashMap<String, String> docTexts;
public ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile; public ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile;
public ConcurrentHashMap<String, String> revisionDate;
public Memory(ConcurrentHashMap<String, String> parsedTexts, ConcurrentHashMap<String, String> docTexts, public Memory(ConcurrentHashMap<String, String> parsedTexts, ConcurrentHashMap<String, String> docTexts,
ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile) { ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile, ConcurrentHashMap<String, String> revisionDate) {
this.parsedTexts = parsedTexts; this.parsedTexts = parsedTexts;
this.docTexts = docTexts; this.docTexts = docTexts;
this.sentenceOffsetsByFile = sentenceOffsetsByFile; this.sentenceOffsetsByFile = sentenceOffsetsByFile;
this.revisionDate = revisionDate;
} }
public void cleanEntry(String fileId) { public void cleanEntry(String fileId) {
this.parsedTexts.remove(fileId); this.parsedTexts.remove(fileId);
this.docTexts.remove(fileId); this.docTexts.remove(fileId);
this.sentenceOffsetsByFile.remove(fileId); this.sentenceOffsetsByFile.remove(fileId);
this.revisionDate.remove(fileId);
} }
} }
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -85,9 +85,9 @@ public class SourceExtractorConfig { ...@@ -85,9 +85,9 @@ public class SourceExtractorConfig {
loadWapitiModels(""); loadWapitiModels("");
//config(); //config();
} }
@PreDestroy @PreDestroy
public void destroy(){ public void destroy() {
logger.debug("destroy"); logger.debug("destroy");
try { try {
FileUtils.forceDelete(this.tempDir); FileUtils.forceDelete(this.tempDir);
...@@ -99,7 +99,7 @@ public class SourceExtractorConfig { ...@@ -99,7 +99,7 @@ public class SourceExtractorConfig {
public SourceExtractorConfig() { public SourceExtractorConfig() {
this.paths = new Paths(); this.paths = new Paths();
this.memory = new Memory(new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>()); this.memory = new Memory(new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>());
} }
public SourceExtractorConfig(File dataDir, File dirLib, File dirResources) { public SourceExtractorConfig(File dataDir, File dirLib, File dirResources) {
......
...@@ -4,6 +4,9 @@ import java.io.File; ...@@ -4,6 +4,9 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -21,13 +24,18 @@ import fr.limsi.sourceExtractor.training.AProcessSupport; ...@@ -21,13 +24,18 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
public class SimplePreprocessing { public class SimplePreprocessing {
private Pattern pattern, patternIdentifier; private Pattern pattern, patternIdentifier, patternCreationDate;
private AProcessSupport processSupport; private AProcessSupport processSupport;
private SimpleDateFormat utcFormat, format2;
public SimplePreprocessing(AProcessSupport processSupport) { public SimplePreprocessing(AProcessSupport processSupport) {
this.pattern = Pattern.compile("<p>([^<]+)</p>"); this.pattern = Pattern.compile("<p>([^<]+)</p>");
this.patternIdentifier = Pattern.compile("<PublicIdentifier>([^<]+)</PublicIdentifier>"); this.patternIdentifier = Pattern.compile("<PublicIdentifier>([^<]+)</PublicIdentifier>");
this.patternCreationDate = Pattern.compile("<ThisRevisionCreated>([^<]+)</ThisRevisionCreated>");
this.processSupport = processSupport; this.processSupport = processSupport;
this.utcFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
this.format2 = new SimpleDateFormat("yyyyMMdd'T'HHmmssXX");
} }
public String tagXMLText(InputStream contentStream, SourceExtractorConfig config) throws IOException { public String tagXMLText(InputStream contentStream, SourceExtractorConfig config) throws IOException {
...@@ -40,6 +48,7 @@ public class SimplePreprocessing { ...@@ -40,6 +48,7 @@ public class SimplePreprocessing {
//String fileId = UUID.randomUUID().toString(); //String fileId = UUID.randomUUID().toString();
String fileId = extractPublicIdentifier(xmlText); String fileId = extractPublicIdentifier(xmlText);
String text = extractTextFromXML(fileId, xmlText, memory); String text = extractTextFromXML(fileId, xmlText, memory);
extractRevisionDate(fileId, xmlText, memory);
// we create the files in which we will write the results // we create the files in which we will write the results
File outFilePRIM = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_PRIM, fileId, ".tag"); File outFilePRIM = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_PRIM, fileId, ".tag");
File outFileSEC = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_SEC, fileId, ".tag"); File outFileSEC = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_SEC, fileId, ".tag");
...@@ -62,7 +71,7 @@ public class SimplePreprocessing { ...@@ -62,7 +71,7 @@ public class SimplePreprocessing {
private String extractPublicIdentifier(String xmlText) { private String extractPublicIdentifier(String xmlText) {
Matcher matcher = this.patternIdentifier.matcher(xmlText); Matcher matcher = this.patternIdentifier.matcher(xmlText);
String fileId =""; String fileId = "";
while (matcher.find()) { while (matcher.find()) {
fileId = matcher.group(1); fileId = matcher.group(1);
} }
...@@ -70,6 +79,27 @@ public class SimplePreprocessing { ...@@ -70,6 +79,27 @@ public class SimplePreprocessing {
return fileId; return fileId;
} }
/**
 * Extracts the content of the last {@code <ThisRevisionCreated>} element found in
 * {@code xmlText} and stores it in {@code memory.revisionDate} under {@code fileId}.
 *
 * <p>A value that already contains {@code Z} is stored as-is. Any other value is parsed
 * as {@code yyyyMMdd'T'HHmmssXX} (explicit offset) and re-formatted in UTC as
 * {@code yyyyMMdd'T'HHmmss'Z'}. When the element is missing or the value cannot be
 * parsed, an empty string is stored so that an entry always exists for the file.
 *
 * @param fileId  key under which the revision date is stored
 * @param xmlText raw XML text to search
 * @param memory  holder whose {@code revisionDate} map receives the result
 */
private void extractRevisionDate(String fileId, String xmlText, Memory memory) {
    Matcher matcher = this.patternCreationDate.matcher(xmlText);
    String revisionDateStr = "";
    // Keep the last occurrence when the element appears several times.
    while (matcher.find()) {
        revisionDateStr = matcher.group(1);
    }
    // Strings are immutable: the cleaned value has to be reassigned, otherwise a
    // stray carriage return or surrounding whitespace makes the parse below fail.
    revisionDateStr = revisionDateStr.replace("\r", "").trim();

    String strDate = "";
    if (revisionDateStr.contains("Z")) {
        // Already expressed in UTC ("Zulu") notation.
        strDate = revisionDateStr;
    } else if (!revisionDateStr.isEmpty()) {
        try {
            // SimpleDateFormat is not thread-safe, so use method-local instances
            // instead of formatter objects shared between callers.
            SimpleDateFormat offsetFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmssXX");
            SimpleDateFormat zuluFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
            // The output pattern appends a literal 'Z', so the formatter must really
            // work in UTC; otherwise local time would be labelled as UTC.
            zuluFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
            Date date = offsetFormat.parse(revisionDateStr);
            strDate = zuluFormat.format(date);
        } catch (ParseException ignored) {
            // Best effort: an unparseable revision date is reported as an empty string
            // rather than failing the whole document.
        }
    }
    memory.revisionDate.put(fileId, strDate);
}
private String extractTextFromXML(String fileId, String xmlText, Memory memory) { private String extractTextFromXML(String fileId, String xmlText, Memory memory) {
Matcher matcher = this.pattern.matcher(xmlText); Matcher matcher = this.pattern.matcher(xmlText);
......
...@@ -962,13 +962,14 @@ public abstract class AProcess extends AProcessSupport { ...@@ -962,13 +962,14 @@ public abstract class AProcess extends AProcessSupport {
private StringBuilder brat2JSON(String fileId, HashMap<String, SourceAnnotation> annotations) { private StringBuilder brat2JSON(String fileId, HashMap<String, SourceAnnotation> annotations) {
String docText = this.memory.docTexts.get(fileId); String docText = this.memory.docTexts.get(fileId);
String revisionDateStr = this.memory.revisionDate.get(fileId);
ArrayList<Integer> sentenceOffsets = this.memory.sentenceOffsetsByFile.get(fileId); ArrayList<Integer> sentenceOffsets = this.memory.sentenceOffsetsByFile.get(fileId);
TreeSet<SourceAnnotation> orderedAnnotations = new TreeSet<>(); TreeSet<SourceAnnotation> orderedAnnotations = new TreeSet<>();
orderedAnnotations.addAll(annotations.values()); orderedAnnotations.addAll(annotations.values());
StringBuilder jsonResult = new StringBuilder(); StringBuilder jsonResult = new StringBuilder();
jsonResult.append("{\"identifier\":\""+fileId+"\",\n"); jsonResult.append("{\"identifier\":\""+fileId+"\",\n");
jsonResult.append("\"revisionDate\":\""+revisionDateStr+"\",\n");
jsonResult.append("\"source_sentences\":[\n"); jsonResult.append("\"source_sentences\":[\n");
int annIndex = 0; int annIndex = 0;
ArrayList<SourceAnnotation> sentenceAnnotations = new ArrayList<>(); ArrayList<SourceAnnotation> sentenceAnnotations = new ArrayList<>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment