Commit 7423d71b authored by Bertrand Goupil's avatar Bertrand Goupil

Use of the news ID

- Replace the temporary UUID generation with the news ID
- Add the news ID to the JSON response
parent 03100854
......@@ -40,9 +40,9 @@
<factorypathentry kind="VARJAR" id="M2_REPO/net/sourceforge/saxon/saxon/9.1.0.8/saxon-9.1.0.8.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xom/xom/1.2.5/xom-1.2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xalan/xalan/2.7.0/xalan-2.7.0.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/lib/jar/stanford-french-corenlp-2016-01-14-models.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/lib/jar/wapiti-1.5.0-win.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/lib/jar/hfst-ol.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/git/code/Limsi-SourceExtractor/lib/jar/stanford-french-corenlp-2016-01-14-models.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/git/code/Limsi-SourceExtractor/lib/jar/wapiti-1.5.0-win.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/git/code/Limsi-SourceExtractor/lib/jar/hfst-ol.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-io/commons-io/2.5/commons-io-2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/org/apache/commons/commons-lang3/3.4/commons-lang3-3.4.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-cli/commons-cli/1.3.1/commons-cli-1.3.1.jar" enabled="true" runInBatchMode="false"/>
......
server:
context-path: /sourceExtractor
context-path: /limsi-sourceExtractor
resource:
#Directory containing the libraries and models
......
......@@ -8,7 +8,7 @@
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.2.RELEASE</version>
<version>1.5.8.RELEASE</version>
</parent>
<name>LimsiSourceExtractor</name>
<properties>
......@@ -156,6 +156,7 @@
<artifactId>spring-boot-starter-tomcat</artifactId>
<scope>provided</scope>
</dependency>
<!-- <dependency> -->
<!-- <groupId>org.springframework.boot</groupId> -->
<!-- <artifactId>spring-boot-starter-actuator</artifactId> -->
......
......@@ -4,7 +4,6 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -22,11 +21,12 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
public class SimplePreprocessing {
private Pattern pattern;
private Pattern pattern, patternIdentifier;
private AProcessSupport processSupport;
/**
 * Creates a preprocessor bound to the given process support and compiles the
 * regular expressions used to read the incoming XML.
 *
 * @param processSupport the process support stored for later use by this instance
 */
public SimplePreprocessing(AProcessSupport processSupport) {
    this.processSupport = processSupport;
    // Captures the text content of each <p>…</p> element.
    this.pattern = Pattern.compile("<p>([^<]+)</p>");
    // Captures the content of the <PublicIdentifier>…</PublicIdentifier> element.
    this.patternIdentifier = Pattern.compile("<PublicIdentifier>([^<]+)</PublicIdentifier>");
}
......@@ -36,8 +36,10 @@ public class SimplePreprocessing {
Paths paths = config.getPaths();
Tools tools = config.getTools();
Resources resources = config.getResources();
String fileId = UUID.randomUUID().toString();
String text = extractTextFromXML(fileId, contentStream, memory);
String xmlText = toString(contentStream);
//String fileId = UUID.randomUUID().toString();
String fileId = extractPublicIdentifier(xmlText);
String text = extractTextFromXML(fileId, xmlText, memory);
// we create the files in which we will write the results
File outFilePRIM = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_PRIM, fileId, ".tag");
File outFileSEC = DIRUtils.createDirAndFilesWithExt(paths.DIR_TEST_FILES_UNLABELED_SEC, fileId, ".tag");
......@@ -51,10 +53,25 @@ public class SimplePreprocessing {
return fileId;
}
private String extractTextFromXML(String fileId, InputStream contentStream, Memory memory) throws IOException {
/**
 * Reads the whole stream into memory and returns it as text, decoding the bytes as UTF-8.
 *
 * @param contentStream the stream to read; it is not closed by this method
 * @return the full content of the stream as a string
 * @throws IOException if copying from the stream fails
 */
private String toString(InputStream contentStream) throws IOException {
    StringWriter buffer = new StringWriter();
    IOUtils.copy(contentStream, buffer, Charsets.UTF_8);
    return buffer.toString();
}
/**
 * Extracts the news identifier from the raw XML text.
 *
 * <p>Searches {@code xmlText} with {@code patternIdentifier}
 * ({@code <PublicIdentifier>…</PublicIdentifier>}). When several elements match,
 * the last one wins. Carriage returns are removed from the captured value.
 *
 * @param xmlText the XML document as text
 * @return the identifier with carriage returns stripped, or an empty string when
 *         no {@code PublicIdentifier} element is found
 */
private String extractPublicIdentifier(String xmlText) {
    Matcher matcher = this.patternIdentifier.matcher(xmlText);
    String fileId = "";
    while (matcher.find()) {
        fileId = matcher.group(1);
    }
    // Strings are immutable: the stripped value has to be returned (the previous
    // code called replaceAll and discarded the result, so '\r' was never removed).
    return fileId.replace("\r", "");
}
private String extractTextFromXML(String fileId, String xmlText, Memory memory) {
Matcher matcher = this.pattern.matcher(xmlText);
StringBuilder result = new StringBuilder();
while (matcher.find()) {
......
......@@ -963,7 +963,8 @@ public abstract class AProcess extends AProcessSupport {
TreeSet<SourceAnnotation> orderedAnnotations = new TreeSet<>();
orderedAnnotations.addAll(annotations.values());
StringBuilder jsonResult = new StringBuilder();
jsonResult.append("{\"source_sentences\":[\n");
jsonResult.append("{\"identifier\":\""+fileId+"\",\n");
jsonResult.append("\"source_sentences\":[\n");
int annIndex = 0;
......
## Directory containing the libraries and models
LIB_DIR=/Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/lib
LIB_DIR=/Users/bertrand/git/code/Limsi-SourceExtractor/lib
## Directory containing the language-dependent resources
RESOURCES_DIR=/Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/resources
RESOURCES_DIR=/Users/bertrand/git/code//Limsi-SourceExtractor/resources
## DATA_DIR should only be set correctly for training the models
## The directory is useless in production mode
......
......@@ -3,9 +3,9 @@ server:
resource:
#Directory containing the libraries and models
lib: /Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/lib
lib: /Users/bertrand/git/code/Limsi-SourceExtractor/lib
#Directory containing the language-dependent resources
resources: /Users/bertrand/Documents/eclipse-workspace/afp-asrael/Limsi-SourceExtractor/resources
resources: /Users/bertrand/git/code/Limsi-SourceExtractor/resources
#Directory containing trained data
data: /home/xtannier/Recherche/SourceExtractor
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment