Commit a8896d89 authored by Bertrand Goupil

Merge branch 'dev' into 'master'

Wapiti memory leak fix

See merge request !6
parents d863ccd8 63f0e1b5
@@ -25,6 +25,7 @@
 		<attributes>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
+			<attribute name="test" value="true"/>
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="src" path=".apt_generated">
@@ -32,5 +33,11 @@
 			<attribute name="optional" value="true"/>
 		</attributes>
 	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path=".apt_generated_tests">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="test" value="true"/>
+		</attributes>
+	</classpathentry>
 	<classpathentry kind="output" path="target/classes"/>
 </classpath>
@@ -30,6 +30,11 @@
 			<arguments>
 			</arguments>
 		</buildCommand>
+		<buildCommand>
+			<name>org.springframework.ide.eclipse.boot.validation.springbootbuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
 	</buildSpec>
 	<natures>
 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
......
@@ -10,4 +10,5 @@ org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
 org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 org.eclipse.jdt.core.compiler.processAnnotations=enabled
+org.eclipse.jdt.core.compiler.release=disabled
 org.eclipse.jdt.core.compiler.source=1.8
<?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0"> <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
<wb-module deploy-name="limsiSourceExtractor"> <wb-module deploy-name="limsiSourceExtractor">
<wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/> <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/> <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/> <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/> <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/.apt_generated"/> <wb-resource deploy-path="/WEB-INF/classes" source-path="/.apt_generated"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/.apt_generated_tests"/>
<property name="java-output-path" value="/Limsi-SourceExtractor/target/classes"/> <property name="java-output-path" value="/Limsi-SourceExtractor/target/classes"/>
<property name="context-root" value="limsiSourceExtractor"/> <property name="context-root" value="limsiSourceExtractor"/>
</wb-module> </wb-module>
</project-modules> </project-modules>
boot.validation.initialized=true
eclipse.preferences.version=1
FROM frolvlad/alpine-java:jdk8-slim
MAINTAINER Duc Cao (tien-duc.cao@inria.fr)
RUN apk add maven
RUN apk add --no-cache gcc musl-dev
RUN apk add make
RUN mkdir /default
WORKDIR /default
COPY pom.xml pom.xml
COPY lib lib
RUN mvn dependency:go-offline -B
RUN mkdir data
COPY src src
COPY run.sh run.sh
COPY resources resources
COPY wapiti-1.5.0 wapiti-1.5.0
RUN mvn clean install
RUN cd /default/wapiti-1.5.0 && make wapiti && cp wapiti /default/lib
ENTRYPOINT ["/bin/sh", "run.sh"]
This tool was originally created in summer 2016 by Gabriel Bellard and Xavier Tannier, at LIMSI-CNRS, Orsay, France.
Code contributors:
- Gabriel Bellard (main contributor)
- Xavier Tannier (contact)
Thanks to the annotators:
- Dominique Ferrandini
- Samuel Laurent
- Daniel Oudet
- Denis Teyssou
Thanks to Christophe Boumenot, who helped us with the Windows Wapiti library.
License to be defined.
Software created by Gabriel Bellard and Xavier Tannier at LIMSI.
Please do not reuse or fork without permission (until we have defined the distribution license).
# Source Extractor (in French only)
**SourceExtractor** is a [CRF](https://en.wikipedia.org/wiki/Conditional_random_field)-based tool for extracting primary and secondary sources from news articles in French.
It detects primary and secondary sources, performs coreference resolution on sources, and detects anonymous sources. It can produce Brat format for visualization or JSON format for machine-readable output.
## Requirements
* **Java 8+**
* On Windows, give Java at least 1 GB of memory: *java -Xmx1g*
* Extensively tested on **Linux**, tested on **Windows**, untested on **Mac** (but the necessary Wapiti native library is loaded).
If you need to recompile the Mac and Windows libraries, see https://github.com/kermitt2/Wapiti
## Installation
The archive contains the following files and directories:
- `sourceextractor.jar`
- `config.properties`: edit this file with the paths to the `lib` and `resources` directories on your computer (see below)
- `lib`: contains external libraries and saved models.
Install this directory wherever you want and
edit the `LIB_DIR` property in the configuration file
- `resources`: contains the language-dependent resources for the system
Install this directory wherever you want and
edit the `RESOURCES_DIR` property in the configuration file
- `code`: the source code
- `LICENSE.txt`: the license file
- `README.md`: this file
- `RESULTS.txt`: the results obtained on an annotated test set by the different models
- `AUTHORS.txt`
## Example usage:
The command is similar on Windows and Linux/Mac, except that a different Wapiti jar must be loaded. Wapiti jars are located in the `lib/wapiti` directory. [Wapiti](https://wapiti.limsi.fr/) is the tool used for learning and running the CRF model.
**Linux/MAC**
`java -Xmx1g -cp lib/jar/*:lib/jar/wapiti/wapiti-1.5.0-lin.jar:source-extractor-0.1.jar fr.limsi.sourceExtractor.SourceExtractor -d <INPUT DIR> -o <OUTPUT DIR> -c <CONFIGURATION FILE> -j <THREAD NUMBER>`
**Windows**
`java -Xrs -Xmx1g -cp lib\jar\*;lib\jar\wapiti\wapiti-1.5.0-win.jar;source-extractor-0.1.jar fr.limsi.sourceExtractor.SourceExtractor -d <INPUT DIR> -o <OUTPUT DIR> -c <CONFIGURATION FILE> -j <THREAD NUMBER>`
### Input:
* The input can be a file (`-f`) or a directory (`-d`) containing documents (the distinction is for convenience only; both `-f` and `-d` actually accept either a file or a directory).
* Default input is plain-text documents. Use option `-newsml` for NewsML files (in fact, the `-newsml` option accepts any XML document whose content is in `<p>` elements).
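For instance, `-f article.txt` would process a single text file, while `-d corpus/ -newsml` would process a directory of NewsML documents (`article.txt` and `corpus/` are placeholder names).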
### Output:
* `-o <DIR>` specifies the output directory
* default output format is JSON. Use option `-b` or `--brat` for a [Brat](http://brat.nlplab.org/) output
* See a description of the JSON output below
### Configuration file:
`config.properties` must be edited with the following information:
* `LIB_DIR` = <path to the `lib` directory downloaded with the distribution>
* `RESOURCES_DIR` = <path to the `resources` directory downloaded with the distribution>
* `DATA_DIR` is only needed for training purposes; there is no need to set it properly in production mode.
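For reference, a minimal `config.properties` could look like the sketch below; the paths are placeholders for wherever you installed the distribution.

```properties
# Path to the 'lib' directory shipped with the distribution (placeholder path)
LIB_DIR=/opt/sourceextractor/lib
# Path to the 'resources' directory shipped with the distribution (placeholder path)
RESOURCES_DIR=/opt/sourceextractor/resources
# Only used for training; its value does not matter in production mode
DATA_DIR=/opt/sourceextractor/data
```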
## Want a faster process?
* Multi-threading:
Default is single-threaded. Use `-j N` to run on N threads.
* No secondary sources:
Use option `-p` to disable the extraction of secondary sources. Both the loading phase and the processing phase will be (much) faster: loading skips the huge list of media names, and processing skips the Wapiti extraction of secondary sources.
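For example, appending `-j 4 -p` to the commands above runs the extraction on four threads and skips secondary sources entirely.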
## License & Co
### License
See the file `LICENSE.txt`
### Third-party libraries and licenses
See the file `THIRD-PARTY.txt`
## Technical details
### JSON output format
Here is what a JSON output looks like:
```javascript
{"source_sentences":[
{"text":"Il a affirmé mardi devant les juges de la CPI n'être responsable d'\"aucune goutte de sang\" versée lors des violences ayant déchiré la Côte d'Ivoire en 2010-2011.",
"sources":[
{"start":635, // start offset of the source in the entire document
"end":637, // end offset of the source in the entire document
"type":"SOURCE-PRIM", // type (SOURCE-PRIM or SOURCE-SEC)
"text":"Il", // text
"value":"Charles Blé Goudé, le ministre de la Jeunesse de l'ancien président ivoirien Laurent Gbagbo", // normalized value (after coreference resolution)
"indexed_value":"Charles Blé Goudé" // normalized value for indexing (a shorter version of the normalized value, where ambiguity on several names inis removed when it exists). Indexing for further research should be done on this field.
}
...
]
}
...
]}
```
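Since this merge request adds [Gson](https://github.com/google/gson) as a dependency, the same library can be used to read the output back. The snippet below is a minimal sketch, assuming Gson is on the classpath and using a hypothetical file name `article.json`.

```java
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ReadSources {
    public static void main(String[] args) throws Exception {
        // "article.json" is a placeholder for any JSON file produced by the extractor
        try (Reader reader = Files.newBufferedReader(Paths.get("article.json"), StandardCharsets.UTF_8)) {
            JsonObject root = new JsonParser().parse(reader).getAsJsonObject();
            for (JsonElement sentenceElement : root.getAsJsonArray("source_sentences")) {
                JsonObject sentence = sentenceElement.getAsJsonObject();
                JsonArray sources = sentence.getAsJsonArray("sources");
                for (JsonElement sourceElement : sources) {
                    JsonObject source = sourceElement.getAsJsonObject();
                    // "type" is SOURCE-PRIM or SOURCE-SEC; "text" is the mention as it appears in the article
                    System.out.println(source.get("type").getAsString() + "\t" + source.get("text").getAsString());
                }
            }
        }
    }
}
```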
### Training set
Want to compare or reproduce our results? Ask us for our training set, which can be shared under certain conditions.
### Adapting to other languages
You'll need:
* To annotate a training set (about 300 documents, i.e. around 2,000 sources, seems to be a good number)
* To build a few resources (citation verbs, profession list, etc.)
* To have a dependency parser in your language, and ideally a lemmatizer
Contact us if you are interested!
AFP documents only (total 255 documents, 75% train, 10% dev, 15% test)
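The columns follow the usual definitions: Precision = TP / (TP + FP), Recall = TP / (TP + FN), F1 = 2 x Precision x Recall / (Precision + Recall). For example, for SOURCE-PRIM in the first table: 167 / (167 + 15) ≈ 0.9176 precision and 167 / (167 + 68) ≈ 0.7106 recall.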
- BIO, with media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 167 15 68 0.9176 0.7106 0.8010
SOURCE-SEC 16 2 17 0.8889 0.4848 0.6275
Overall 183 17 85 0.9150 0.6828 0.7821
- BIO without media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 167 14 67 0.9227 0.7137 0.8048
SOURCE-SEC 0 0 33 0.0000 0.0000 0.0000
Overall 167 14 100 0.9227 0.6255 0.7455
- IO, with media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 174 16 60 0.9158 0.7436 0.8208
SOURCE-SEC 13 0 20 1.0000 0.3939 0.5652
Overall 187 16 80 0.9212 0.7004 0.7957
- IO, without media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 172 16 62 0.9149 0.7350 0.8152
SOURCE-SEC 0 0 33 0.0000 0.0000 0.0000
Overall 172 16 95 0.9149 0.6442 0.7560
AFP+Web documents (total 363 documents, 75% train, 10% dev, 15% test)
- BIO, with media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 256 28 68 0.9014 0.7901 0.8421
SOURCE-SEC 28 2 22 0.9333 0.5600 0.7000
Overall 284 30 90 0.9045 0.7594 0.8256
- BIO, without media list and secondary sources
Summary
TP FP FN Precision Recall F1
SOURCE-PRIM 256 28 68 0.9014 0.7901 0.8421
SOURCE-SEC 0 0 50 0.0000 0.0000 0.0000
Overall 256 28 118 0.9014 0.6845 0.7781
* commons-cli (Apache License, Version 2.0)
* commons-collections (Apache License, Version 2.0)
* commons-io (Apache License, Version 2.0)
* commons-lang (Apache License, Version 2.0)
* grobid (Apache License, Version 2.0)
* guava (Apache License, Version 2.0)
* hfst-ol (Apache License, Version 2.0)
* liblinear-java (https://github.com/bwaldvogel/liblinear-java/blob/master/COPYRIGHT)
* log4j (Apache License, Version 2.0)
* slf4j (MIT License)
* maltparser (http://www.maltparser.org/license.html)
* maltparser French model has been provided by Candito et al. This model can be used for research purposes provided that you have a (free) license for the French Treebank. If you want to use it for commercial applications, please contact the license holder for the treebank to find out which conditions apply.
* stanford-corenlp (GNU General Public License (v3))
* wapiti-X.jar and the native libraries have been compiled by ourselves,
with the help of code from Grobid and Christophe Boumenot
* lemmatization is an adapted version of Ahmet Aker's code, which comes with no license information (http://staffwww.dcs.shef.ac.uk/people/A.Aker/activityNLPProjects.html)
@@ -55,6 +55,7 @@
 				<includes>
 					<include>lib/**/*</include>
 					<include>resources/**/*</include>
+					<include>wapiti-1.5.0/**/*</include>
 				</includes>
 				<excludes>
 					<exclude>lib/jar/**/*</exclude>
@@ -107,13 +108,6 @@
 			<scope>system</scope>
 			<systemPath>${project.basedir}/lib/jar/stanford-french-corenlp-2016-01-14-models.jar</systemPath>
 		</dependency>
-		<dependency>
-			<groupId>fr.limsi.wapiti</groupId>
-			<artifactId>wapiti-win</artifactId>
-			<version>1.5.0</version>
-			<scope>system</scope>
-			<systemPath>${project.basedir}/lib/jar/wapiti-1.5.0-win.jar</systemPath>
-		</dependency>
 		<dependency>
 			<groupId>net.sf</groupId>
 			<artifactId>hfst</artifactId>
@@ -156,6 +150,11 @@
 			<artifactId>spring-boot-starter-tomcat</artifactId>
 			<scope>provided</scope>
 		</dependency>
+		<dependency>
+			<groupId>com.google.code.gson</groupId>
+			<artifactId>gson</artifactId>
+			<version>2.8.5</version>
+		</dependency>
 		<!-- <dependency> -->
 		<!-- <groupId>org.springframework.boot</groupId> -->
......
-FROM frolvlad/alpine-oraclejdk8:slim
+FROM frolvlad/alpine-java:jdk8-slim
 MAINTAINER Bertrand Goupil (bertrand.goupil@afp.com)
-RUN apk update && \
-    apk add --no-cache \
-    libstdc++
+RUN apk update
+RUN apk add --no-cache gcc musl-dev
+RUN apk add make
-VOLUME /tmp
-VOLUME /configuration
-VOLUME /configuration/lib
-VOLUME /configuration/resources
-VOLUME /default
-COPY resources /default/resources
-COPY lib/maltparser /default/lib/maltparser
-COPY lib/wapiti_java /default/lib/wapiti_java
-COPY lib/wapiti_models /default/lib/wapiti_models
-ADD limsiSourceExtractor.jar limsiSourceExtractor.jar
+#VOLUME /tmp
+#VOLUME /configuration
+#VOLUME /configuration/lib
+#VOLUME /configuration/resources
+#VOLUME /default
+#COPY resources /default/resources
+#COPY lib/maltparser /default/lib/maltparser
+#COPY lib/wapiti_java /default/lib/wapiti_java
+#COPY lib/wapiti_models /default/lib/wapiti_models
+#COPY wapiti-1.5.0 /default/wapiti-1.5.0
+RUN mkdir /default
+WORKDIR /default
+COPY lib lib
+COPY resources resources
+COPY wapiti-1.5.0 wapiti-1.5.0
+RUN cd /default/wapiti-1.5.0 && make wapiti && cp wapiti /default/lib
+COPY limsiSourceExtractor.jar limsiSourceExtractor.jar
 EXPOSE 8080
-RUN sh -c 'touch /limsiSourceExtractor.jar'
+#RUN sh -c 'touch /limsiSourceExtractor.jar'
 #ENV JAVA_OPTS="-Dspring.config.location=/configuration/"
 #ENTRYPOINT [ "sh", "-c", "java $JAVA_OPTS -Dspring.config.location=/configuration/ -Djava.security.egd=file:/dev/./urandom -jar /limsiSourceExtractor.jar" ]
-ENTRYPOINT exec java $JAVA_OPTS -Dspring.config.location=/configuration/ -Djava.security.egd=file:/dev/./urandom -jar /limsiSourceExtractor.jar
+ENTRYPOINT exec java -Djava.security.egd=file:/dev/./urandom -jar /default/limsiSourceExtractor.jar
\ No newline at end of file
@@ -17,6 +17,7 @@ public class CLIParameters {
 	private String[] args = null;
 	private Options options = new Options();
+	public static final String OPTION_DATA_SPLIT = "s";
 	public static final String OPTION_DATA_TYPE_LABELED = "l";
 	public static final String OPTION_CONFIG = "c";
 	public static final String OPTION_DATA_TYPE_UNLABELED = "u";
@@ -38,6 +39,7 @@ public class CLIParameters {
 		OptionGroup optionGroup = new OptionGroup();
+		Option split = new Option(OPTION_DATA_SPLIT, "split", false, "split data into training/dev/test sets.");
 		Option labeled = new Option(OPTION_DATA_TYPE_LABELED, "labeled", false, "if test files are labeled.");
 		// labeled.setArgName(ARGUMENT_ENCODING);
@@ -65,6 +67,7 @@ public class CLIParameters {
 		Option jobNumber = new Option(OPTION_JOB_NUMBER, true, "job number (default is 1 -- no multi-threading)");
+		optionGroup.addOption(split);
 		optionGroup.addOption(labeled);
 		optionGroup.addOption(finalFile);
 		optionGroup.addOption(finalDir);
@@ -116,5 +119,3 @@
 	}
 }
@@ -23,7 +23,7 @@ import fr.limsi.sourceExtractor.training.TrainingUnLabel;
  */
 public class MainSourceExtractor {
 	// Default configuration file
-	private static final String DEFAULT_CONFIG_FILE = "config.properties";
+	private static final String DEFAULT_CONFIG_FILE = "src/main/resources/config.properties";
 	// Configuration fields
 	private static final String DATA_DIR_PROPERTY = "DATA_DIR";
@@ -43,7 +43,6 @@ public class MainSourceExtractor {
 	}
 	public static void main(String[] args) throws IOException, MaltChainedException, InterruptedException, URISyntaxException {
-		long startTime = System.currentTimeMillis();
 		// Debug mode
 		if (args.length == 0) {
 			// String argsStr = "-u -b -c
@@ -103,10 +102,16 @@
 		config.loadTools();
-		config.loadWapitiNativeLib();
+		// config.loadWapitiNativeLib();
+		long startTime = System.currentTimeMillis();
+		if (cmd.hasOption(CLIParameters.OPTION_DATA_SPLIT)) {
+			TrainingLabel trainingLabel = new TrainingLabel(config);
+			trainingLabel.splitTrainDevTest();
+		}
 		// if labeled
-		if (cmd.hasOption(CLIParameters.OPTION_DATA_TYPE_LABELED)) {
+		else if (cmd.hasOption(CLIParameters.OPTION_DATA_TYPE_LABELED)) {
 			TrainingLabel trainingLabel = new TrainingLabel(config);
 			trainingLabel.train(modelSuffix, jubNumber);
......
 package fr.limsi.sourceExtractor;
+import com.google.gson.JsonObject;
 public class SourceAnnotation implements Comparable<SourceAnnotation> {
 	private String id;
 	private int leftOffset;
@@ -94,22 +96,23 @@ public class SourceAnnotation implements Comparable<SourceAnnotation> {
 		}
 		return main;
 	}
-	public String toJSON() {
-		String main = "{\"start\":" + leftOffset + ", \"end\":" + rightOffset +
-				", \"type\":\"" + type + "\"" +
-				", \"text\":\"" + text.replaceAll("\"", "'").trim() + "\"";
-		if (value != null && value.length() > 0) {
-			main += ", \"value\":\"" + value.replaceAll("\"", "'").trim() + "\"";
+	public JsonObject toJSON() {
+		JsonObject main = new JsonObject();
+		main.addProperty("start", leftOffset);
+		main.addProperty("end", rightOffset);
+		main.addProperty("type", type);
+		main.addProperty("text", text.trim());
+		if (value != null && value.trim().length() > 0) {
+			main.addProperty("value", value.trim());
 		}
-		if (indexValue != null && indexValue.length() > 0) {
-			main += ", \"indexed_value\":\"" + indexValue.replaceAll("\"", "'").trim() + "\"";
+		if (indexValue != null && indexValue.trim().length() > 0) {
+			main.addProperty("indexed_value", indexValue.trim());
 		}
 		if (anonymous) {
-			main += ", \"anonymous\":\"yes\"";
+			main.addProperty("anonymous", "yes");
 		}
-		main += "}";
 		return main;
 	}
 }
@@ -41,6 +41,8 @@ import org.maltparser.core.exception.MaltChainedException;
 import com.google.common.base.Charsets;
 import com.google.common.io.Files;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
 import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
@@ -568,8 +570,9 @@
 		TreeSet<SourceAnnotation> orderedAnnotations = new TreeSet<>();
 		orderedAnnotations.addAll(annotations.values());
-		StringBuilder jsonResult = new StringBuilder();
-		jsonResult.append("{\"source_sentences\":[\n");
+		JsonObject jsonResult = new JsonObject();
+		JsonArray sentencesArray = new JsonArray();
 		int annIndex = 0;
@@ -593,7 +596,7 @@
 			// New sentence
 			else {
 				if (!sentenceAnnotations.isEmpty()) {
-					annIndex = appendToJSON(docText, jsonResult, annIndex, sentenceAnnotations, currentSentenceIndex, currentSentenceOffset, nextSentenceOffset);
+					annIndex = appendToJSON(docText, sentencesArray, annIndex, sentenceAnnotations, currentSentenceIndex, currentSentenceOffset, nextSentenceOffset);
 					sentenceAnnotations.clear();
 				}
@@ -615,30 +618,25 @@
 		// System.out.println("--" + currentSentenceOffset + " " + nextSentenceOffset);
 		// Last annotations
 		if (!sentenceAnnotations.isEmpty()) {
-			annIndex = appendToJSON(docText, jsonResult, annIndex, sentenceAnnotations, currentSentenceIndex, currentSentenceOffset, nextSentenceOffset);
+			annIndex = appendToJSON(docText, sentencesArray, annIndex, sentenceAnnotations, currentSentenceIndex, currentSentenceOffset, nextSentenceOffset);
 		}
-		jsonResult.append("]}\n");
+		jsonResult.add("source_sentences", sentencesArray);
 		Files.write(jsonResult.toString(), jsonFile, Charsets.UTF_8);
 	}
-	private int appendToJSON(String docText, StringBuilder jsonResult, int annIndex, ArrayList<SourceAnnotation> sentenceAnnotations, int currentSentenceIndex,
+	private int appendToJSON(String docText, JsonArray sentencesArray, int annIndex, ArrayList<SourceAnnotation> sentenceAnnotations, int currentSentenceIndex,
 			int currentSentenceOffset, int nextSentenceOffset) {
-		if (annIndex > 0) {
-			jsonResult.append(",");
-		}
 		// System.out.println("substring " + currentSentenceOffset + " -> " + (nextSentenceOffset - currentSentenceOffset));
-		jsonResult.append(" {\"text\":\"" + docText.substring(currentSentenceOffset, nextSentenceOffset).replaceAll("\"", "\\\\\"").trim() + "\",");
-		jsonResult.append("\n \"sources\":[");
-		int senAnnIndex = 0;
+		JsonObject sentenceObject = new JsonObject();
+		JsonArray sourcesArray = new JsonArray();
+		sentenceObject.addProperty("text", docText.substring(currentSentenceOffset, nextSentenceOffset));
 		for (SourceAnnotation sentenceAnnotation : sentenceAnnotations) {
-			if (senAnnIndex > 0) {
-				jsonResult.append(",");
-			}
-			jsonResult.append("\n" + sentenceAnnotation.toJSON());
-			senAnnIndex++;
+			sourcesArray.add(sentenceAnnotation.toJSON());
 		}
-		jsonResult.append(" ]}\n");
+		sentenceObject.add("sources", sourcesArray);
+		sentencesArray.add(sentenceObject);
 		annIndex++;
 		return annIndex;
 	}
@@ -3311,7 +3309,9 @@
 		// wapiti label -m modelPrim input=tests_files_one_by_one output=DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_PRIM -p
 		WapitiLabeling wapitiPrim = WapitiLabeling.getWapitiInstance(modelPrim);
-		wapitiPrim.wapitiTest(sourceExtractor.paths.DIR_TEST_FILES_PRIM, sourceExtractor.paths.DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_PRIM, jubNumber);
+		wapitiPrim.wapitiTest(sourceExtractor.paths.DIR_TEST_FILES_PRIM, sourceExtractor.paths.DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_PRIM, jubNumber,
+				modelPrim.getAbsolutePath(),
+				DIR_LIB.getAbsolutePath());
 		if (searchSecondary) {
 			System.out.println();
@@ -3320,7 +3320,9 @@
 				sourceExtractor.paths.DIR_TEST_FILES_SEC, true);
 			// wapiti label -m modelSec input=tests_files_one_by_one output=DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_Sec -p
 			WapitiLabeling wapitiSec = WapitiLabeling.getWapitiInstance(modelSec);
-			wapitiSec.wapitiTest(sourceExtractor.paths.DIR_TEST_FILES_SEC, sourceExtractor.paths.DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_SEC, jubNumber);
+			wapitiSec.wapitiTest(sourceExtractor.paths.DIR_TEST_FILES_SEC, sourceExtractor.paths.DIR_BIO_FILES_FOR_CONVERSION_WAPITI_LABELED_SEC, jubNumber,
+				modelSec.getAbsolutePath(),