...
 
Commits (2)
......@@ -7,19 +7,21 @@ public class Memory {
public ConcurrentHashMap<String, String> parsedTexts;
public ConcurrentHashMap<String, String> docTexts;
public ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile;
public ConcurrentHashMap<String, String> revisionDate;
/**
 * Creates a Memory that stores the given maps by reference (no defensive copy).
 *
 * NOTE(review): the two consecutive parameter lines below look like the
 * before/after of a diff hunk. Only the second one declares revisionDate,
 * which the body assigns — confirm that it is the live signature.
 *
 * @param parsedTexts           map assigned to {@code this.parsedTexts}
 * @param docTexts              map assigned to {@code this.docTexts}
 * @param sentenceOffsetsByFile map assigned to {@code this.sentenceOffsetsByFile}
 * @param revisionDate          map assigned to {@code this.revisionDate}
 */
public Memory(ConcurrentHashMap<String, String> parsedTexts, ConcurrentHashMap<String, String> docTexts,
ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile) {
ConcurrentHashMap<String, ArrayList<Integer>> sentenceOffsetsByFile, ConcurrentHashMap<String, String> revisionDate) {
this.parsedTexts = parsedTexts;
this.docTexts = docTexts;
this.sentenceOffsetsByFile = sentenceOffsetsByFile;
this.revisionDate = revisionDate;
}
/**
 * Drops every entry stored under the given key from the four maps held by
 * this object (parsed text, document text, sentence offsets, revision date).
 *
 * @param fileId key whose entries are removed from each map
 */
public void cleanEntry(String fileId) {
    parsedTexts.remove(fileId);
    docTexts.remove(fileId);
    sentenceOffsetsByFile.remove(fileId);
    revisionDate.remove(fileId);
}
}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -87,7 +87,7 @@ public class SourceExtractorConfig {
}
@PreDestroy
public void destroy(){
public void destroy() {
logger.debug("destroy");
try {
FileUtils.forceDelete(this.tempDir);
......@@ -99,7 +99,7 @@ public class SourceExtractorConfig {
/**
 * No-argument constructor: creates a fresh Paths object and a Memory built
 * from newly created, empty ConcurrentHashMap instances.
 *
 * NOTE(review): two assignments to {@code this.memory} appear below and look
 * like the before/after of a diff hunk. The four-map form is the one that
 * supplies a map for the revision dates — confirm that it is the live line.
 */
public SourceExtractorConfig() {
this.paths = new Paths();
this.memory = new Memory(new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>());
this.memory = new Memory(new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>(), new ConcurrentHashMap<>());
}
public SourceExtractorConfig(File dataDir, File dirLib, File dirResources) {
......
......@@ -4,6 +4,9 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -21,13 +24,18 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
public class SimplePreprocessing {
private Pattern pattern, patternIdentifier;
private Pattern pattern, patternIdentifier, patternCreationDate;
private AProcessSupport processSupport;
private SimpleDateFormat utcFormat, format2;
/**
 * Builds a preprocessor: compiles the regular expressions used to pull
 * paragraphs, the public identifier and the revision timestamp out of the
 * XML text, and prepares the two date formats used for the revision date.
 *
 * @param processSupport stored in {@code this.processSupport}
 */
public SimplePreprocessing(AProcessSupport processSupport) {
    this.pattern = Pattern.compile("<p>([^<]+)</p>");
    this.patternIdentifier = Pattern.compile("<PublicIdentifier>([^<]+)</PublicIdentifier>");
    this.patternCreationDate = Pattern.compile("<ThisRevisionCreated>([^<]+)</ThisRevisionCreated>");
    this.processSupport = processSupport;

    this.utcFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
    // The trailing 'Z' in the pattern is a quoted literal, not a zone field.
    // Without an explicit zone the formatter renders in the JVM default time
    // zone and would label local wall-clock time as UTC. Fully qualified to
    // avoid touching the file's import list.
    this.utcFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));

    // Input format for timestamps that carry a numeric offset (e.g. +0100).
    this.format2 = new SimpleDateFormat("yyyyMMdd'T'HHmmssXX");
}
public String tagXMLText(InputStream contentStream, SourceExtractorConfig config) throws IOException {
......@@ -40,6 +48,7 @@ public class SimplePreprocessing {
//String fileId = UUID.randomUUID().toString();
String fileId = extractPublicIdentifier(xmlText);
String text = extractTextFromXML(fileId, xmlText, memory);
extractRevisionDate(fileId, xmlText, memory);
// we create the files in which we will write the results
File outFilePRIM = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_PRIM, fileId, ".tag");
File outFileSEC = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_SEC, fileId, ".tag");
......@@ -62,7 +71,7 @@ public class SimplePreprocessing {
private String extractPublicIdentifier(String xmlText) {
Matcher matcher = this.patternIdentifier.matcher(xmlText);
String fileId ="";
String fileId = "";
while (matcher.find()) {
fileId = matcher.group(1);
}
......@@ -70,6 +79,27 @@ public class SimplePreprocessing {
return fileId;
}
/**
 * Reads the revision timestamp from the XML text and records it in
 * {@code memory.revisionDate} under {@code fileId}.
 *
 * The content of the last {@code <ThisRevisionCreated>} element is used.
 * A value containing "Z" is stored unchanged; any other value is parsed with
 * {@code format2} and re-rendered with {@code utcFormat}. When no element is
 * found or the value cannot be parsed, an empty string is stored.
 *
 * @param fileId  key under which the date string is stored
 * @param xmlText XML document text to search
 * @param memory  holder whose {@code revisionDate} map receives the result
 */
private void extractRevisionDate(String fileId, String xmlText, Memory memory) {
    Matcher matcher = this.patternCreationDate.matcher(xmlText);
    String revisionDateStr = "";
    while (matcher.find()) {
        // keep overwriting so the last occurrence wins
        revisionDateStr = matcher.group(1);
    }
    // Strings are immutable: the stripped value has to be assigned back,
    // otherwise the carriage returns stay in the timestamp.
    revisionDateStr = revisionDateStr.replace("\r", "");

    String strDate = "";
    if (revisionDateStr.contains("Z")) {
        strDate = revisionDateStr;
    } else if (!revisionDateStr.isEmpty()) {
        // NOTE(review): SimpleDateFormat is not thread-safe; if this instance
        // is shared between threads these two calls need guarding — confirm.
        try {
            Date date = format2.parse(revisionDateStr);
            strDate = utcFormat.format(date);
        } catch (ParseException ignored) {
            // Best effort: an unparseable timestamp is recorded as an empty
            // string rather than aborting the processing of the document.
        }
    }
    memory.revisionDate.put(fileId, strDate);
}
private String extractTextFromXML(String fileId, String xmlText, Memory memory) {
Matcher matcher = this.pattern.matcher(xmlText);
......
......@@ -962,13 +962,14 @@ public abstract class AProcess extends AProcessSupport {
private StringBuilder brat2JSON(String fileId, HashMap<String, SourceAnnotation> annotations) {
String docText = this.memory.docTexts.get(fileId);
String revisionDateStr = this.memory.revisionDate.get(fileId);
ArrayList<Integer> sentenceOffsets = this.memory.sentenceOffsetsByFile.get(fileId);
TreeSet<SourceAnnotation> orderedAnnotations = new TreeSet<>();
orderedAnnotations.addAll(annotations.values());
StringBuilder jsonResult = new StringBuilder();
jsonResult.append("{\"identifier\":\""+fileId+"\",\n");
jsonResult.append("\"revisionDate\":\""+revisionDateStr+"\",\n");
jsonResult.append("\"source_sentences\":[\n");
int annIndex = 0;
ArrayList<SourceAnnotation> sentenceAnnotations = new ArrayList<>();
......