Commits (2)
# Image that builds the source-extractor (Maven project) together with the
# bundled Wapiti 1.5.0 binary, and runs it through run.sh.
FROM frolvlad/alpine-oraclejdk8:slim

# MAINTAINER is deprecated; the same information is kept as a label.
LABEL maintainer="Duc Cao (tien-duc.cao@inria.fr)"

# Build tool-chain: Maven for the Java sources, gcc/make/musl-dev for Wapiti.
# --no-cache fetches a fresh package index and leaves no apk cache in the layer.
RUN apk add --no-cache \
      gcc \
      make \
      maven \
      musl-dev

# WORKDIR creates the directory when it does not exist yet.
WORKDIR /default

# Resolve Maven dependencies before copying the sources so that this layer is
# reused as long as pom.xml and lib/ are unchanged.
COPY pom.xml pom.xml
COPY lib lib
RUN mvn dependency:go-offline -B

# Mount point for the host data directory (see `docker run -v .../data:/default/data`).
RUN mkdir data

COPY src src
COPY run.sh run.sh
COPY resources resources
COPY wapiti-1.5.0 wapiti-1.5.0
RUN mvn clean install

# Compile Wapiti and place the binary next to the other libraries.
WORKDIR /default/wapiti-1.5.0
RUN make wapiti && cp wapiti /default/lib

# run.sh is resolved relative to /default and its classpath points at
# /root/.m2/repository, so the container keeps running as root.
WORKDIR /default
ENTRYPOINT ["/bin/sh", "run.sh"]
# To build the Docker image (run this only once)
sudo docker build -t source-extractor .
# To run the source-extractor, you can take "sample_input" as <input_folder>
(sudo) ./run_docker.sh extract <input_folder>
# To evaluate the model's performance (the folder input_labeled contains 363 annotated documents and is provided upon request)
Install python3
pip3 install scikit-learn
(sudo) ./run_docker.sh evaluate input_labeled
Performance report (primary sources first, then secondary sources):
F1 score 0.971039182282794
Recall 0.9595959595959596
Precision 0.9827586206896551
F1 score 0.8920863309352518
Recall 0.8378378378378378
Precision 0.9538461538461539
import os

# Compare every file produced in out/ with the reference file of the same
# name in expected_output/ and print a summary of the differences.
is_correct_output = True
output_folder = 'out/'
expected_output_folder = 'expected_output/'

for file_name in os.listdir(output_folder):
    expected_path = os.path.join(expected_output_folder, file_name)
    # An output file without a reference counterpart is reported as a
    # difference instead of aborting the whole run with FileNotFoundError.
    if not os.path.isfile(expected_path):
        print('DIFF', file_name)
        is_correct_output = False
        continue
    with open(os.path.join(output_folder, file_name)) as f:
        out = f.read()
    with open(expected_path) as f:
        expected = f.read()
    if out != expected:
        print('DIFF', file_name)
        is_correct_output = False

# Reference files that were never produced are only visible through the count.
if len(os.listdir(output_folder)) != len(os.listdir(expected_output_folder)):
    print('Different number of output files')
    is_correct_output = False

if is_correct_output:
    print('Correct output')
import os
import shutil
import itertools
from sklearn.metrics import f1_score, precision_score, recall_score
def evaluate(test_folder='data/labeled/output_labeled/test_prim'):
# Copy the text files matching the entries of `test_folder` into the extractor
# input folder, run the source-extractor Docker image on them, then print F1,
# recall and precision for primary sources followed by secondary sources.
#
# test_folder: directory whose file names (with '.tag' replaced by '.txt')
#              select the documents to evaluate.
# Prepare necessary files
txt_folder = 'data/labeled/input_labeled/'
input_txt_folder = 'data/input_txt/'
if os.path.exists(input_txt_folder):
shutil.rmtree(input_txt_folder, ignore_errors=True)
# NOTE(review): no visible line recreates input_txt_folder after the removal
# above; confirm the directory exists before the copy below.
for file_name in os.listdir(test_folder):
txt_file_name = file_name.replace('.tag', '.txt')
shutil.copyfile(os.path.join(txt_folder, txt_file_name), \
os.path.join(input_txt_folder, txt_file_name))
# Execute run.sh script
out_ann_folder = 'data/out'
# Remove the output of any previous run.
if os.path.exists(out_ann_folder):
shutil.rmtree(out_ann_folder, ignore_errors=True)
# The host ./data directory is mounted at /default/data; the container receives
# "extract", the input folder, the output folder and 4 as its arguments.
os.system(f'sudo docker run -v `pwd`/data:/default/data \
source-extractor extract {input_txt_folder} {out_ann_folder} 4')
# Compute scores
gold_standard_ann_folder = 'data/labeled/input_labeled_converted'
y_true_prim, y_pred_prim, y_true_sec, y_pred_sec = list(), list(), list(), list()
# Append one true/predicted label per annotation position found in the
# extractor output (out_d) or in the gold standard (gold_d).
# NOTE(review): the end of this helper is not visible here. Positions present
# in both dicts are visited twice by the chain below; confirm this is intended.
def update_y(out_d, gold_d, y_true, y_pred):
for position in itertools.chain(out_d.keys(), gold_d.keys()):
y_true.append(1 if position in gold_d.keys() else 0)
if position in out_d.keys():
# Print the three scikit-learn metrics for one pair of label lists.
def print_scores(y_true, y_pred):
print('F1 score', f1_score(y_true, y_pred))
print('Recall', recall_score(y_true, y_pred))
print('Precision', precision_score(y_true, y_pred))
# Each produced .ann file is compared with the gold file of the same name.
for file_name in os.listdir(out_ann_folder):
if '.ann' in file_name:
out_d_prim, out_d_sec = read_ann_file(os.path.join(out_ann_folder, file_name))
gold_d_prim, gold_d_sec = read_ann_file(os.path.join(gold_standard_ann_folder, file_name))
update_y(out_d_prim, gold_d_prim, y_true_prim, y_pred_prim)
update_y(out_d_sec, gold_d_sec, y_true_sec, y_pred_sec)
# Print scores
print_scores(y_true_prim, y_pred_prim)
print_scores(y_true_sec, y_pred_sec)
def read_ann_file(file_path):
    """Read a tab-separated annotation file and index its source annotations.

    Each usable line has three or four tab-separated fields; the second field
    is ``"<label> <char_start> <char_end>"`` and the third is the annotated
    text. Lines containing ``#`` anywhere are ignored, as are lines with any
    other number of fields (for example blank lines).

    Args:
        file_path: path of the annotation file to read.

    Returns:
        A pair ``(primary, secondary)`` of dicts mapping
        ``"<char_start>-<char_end>"`` to the annotated text, for the labels
        ``SOURCE-PRIM`` and ``SOURCE-SEC`` respectively.

    Raises:
        ValueError: if the second field of a usable line does not consist of
            exactly three space-separated parts.
    """
    # {char_start-char_end: text}
    dict_source_prim, dict_source_sec = dict(), dict()
    with open(file_path) as f:
        for line in f.read().splitlines():
            if '#' in line:
                continue
            tokens = line.split('\t')
            if len(tokens) == 4:
                _, label_and_chars, text, _ = tokens
            elif len(tokens) == 3:
                _, label_and_chars, text = tokens
            else:
                # Without this guard the names below are either unbound
                # (first line) or still hold the previous line's values.
                continue
            label, char_start, char_end = label_and_chars.split(' ')
            if label == 'SOURCE-PRIM':
                dict_source_prim[char_start + '-' + char_end] = text
            elif label == 'SOURCE-SEC':
                dict_source_sec[char_start + '-' + char_end] = text
    return dict_source_prim, dict_source_sec
......@@ -4,16 +4,6 @@
<!-- <packaging>war</packaging> -->
......@@ -101,7 +91,7 @@
......@@ -155,12 +145,6 @@
......@@ -171,12 +155,6 @@
<!-- <dependency> -->
......@@ -184,4 +162,4 @@
<!-- <artifactId>spring-boot-starter-actuator</artifactId> -->
<!-- </dependency> -->
\ No newline at end of file
command="java -Dfile.encoding=UTF-8 -classpath target/classes:/root/.m2/repository/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar:/root/.m2/repository/com/apple/AppleJavaExtensions/1.4/AppleJavaExtensions-1.4.jar:/root/.m2/repository/de/jollyday/jollyday/0.4.9/jollyday-0.4.9.jar:/root/.m2/repository/javax/xml/bind/jaxb-api/2.2.7/jaxb-api-2.2.7.jar:/root/.m2/repository/org/apache/lucene/lucene-queryparser/4.10.3/lucene-queryparser-4.10.3.jar:/root/.m2/repository/org/apache/lucene/lucene-sandbox/4.10.3/lucene-sandbox-4.10.3.jar:/root/.m2/repository/org/apache/lucene/lucene-analyzers-common/4.10.3/lucene-analyzers-common-4.10.3.jar:/root/.m2/repository/org/apache/lucene/lucene-queries/4.10.3/lucene-queries-4.10.3.jar:/root/.m2/repository/org/apache/lucene/lucene-core/4.10.3/lucene-core-4.10.3.jar:/root/.m2/repository/javax/servlet/javax.servlet-api/3.1.0/javax.servlet-api-3.1.0.jar:/root/.m2/repository/joda-time/joda-time/2.9.9/joda-time-2.9.9.jar:/root/.m2/repository/com/googlecode/efficient-java-matrix-library/ejml/0.23/ejml-0.23.jar:/root/.m2/repository/org/glassfish/javax.json/1.0.4/javax.json-1.0.4.jar:/root/.m2/repository/org/slf4j/slf4j-api/1.7.25/slf4j-api-1.7.25.jar:/root/.m2/repository/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar:/root/.m2/repository/org/maltparser/maltparser/1.9.0/maltparser-1.9.0.jar:/root/.m2/repository/log4j/log4j/1.2.16/log4j-1.2.16.jar:/root/.m2/repository/tw/edu/ntu/csie/libsvm/3.1/libsvm-3.1.jar:/root/.m2/repository/de/bwaldvogel/liblinear/1.8/liblinear-1.8.jar:/root/.m2/repository/com/google/guava/guava/21.0/guava-21.0.jar:/root/.m2/repository/org/grobid/grobid-core/0.5.0/grobid-core-0.5.0.jar:/root/.m2/repository/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar:/root/.m2/repository/com/cybozu/labs/langdetect/1.1-20120112/langdetect-1.1-20120112.jar:/root/.m2/repository/com/rockymadden/stringmetric/stringmetric-core_2.10/0.27.3/stringmetric-core_2.10-0.27.3.jar:/root/.m2/repository/org/scala-lang/sc
ala-library/2.10.3/scala-library-2.10.3.jar:/root/.m2/repository/commons-pool/commons-pool/1.6/commons-pool-1.6.jar:/root/.m2/repository/commons-dbutils/commons-dbutils/1.7/commons-dbutils-1.7.jar:/root/.m2/repository/org/apache/httpcomponents/httpclient/4.5.3/httpclient-4.5.3.jar:/root/.m2/repository/org/apache/httpcomponents/httpcore/4.4.8/httpcore-4.4.8.jar:/root/.m2/repository/commons-codec/commons-codec/1.10/commons-codec-1.10.jar:/root/.m2/repository/xerces/xercesImpl/2.11.0/xercesImpl-2.11.0.jar:/root/.m2/repository/net/arnx/jsonic/1.3.10/jsonic-1.3.10.jar:/root/.m2/repository/org/apache/pdfbox/pdfbox/1.8.9/pdfbox-1.8.9.jar:/root/.m2/repository/org/apache/pdfbox/fontbox/1.8.9/fontbox-1.8.9.jar:/root/.m2/repository/org/apache/pdfbox/jempbox/1.8.9/jempbox-1.8.9.jar:/root/.m2/repository/commons-logging/commons-logging/1.1.1/commons-logging-1.1.1.jar:/root/.m2/repository/net/sourceforge/saxon/saxon/ fr.limsi.sourceExtractor.MainSourceExtractor"
# Dispatch on the requested mode.
# NOTE(review): the assignment of $option and the closing `fi` of each branch
# are not visible in this excerpt; confirm they exist in the full script.
# "evaluate": run the extractor with -s (described in CLIParameters as
# "split data into training/dev/test sets").
if [ "$option" == "evaluate" ]; then
$command -s
# "extract": forward positional arguments 2, 3 and 4 as the values of
# -f, -o and -j, and additionally pass -b.
if [ "$option" == "extract" ]; then
$command -f $2 -o $3 -b -j $4
# Host-side wrapper: prepare ./data, then run the source-extractor image with
# ./data mounted at /default/data inside the container.
# NOTE(review): $option, $input_folder, $output_folder and $num_threads, as well
# as the closing `fi` of each branch, are not visible in this excerpt.
# "evaluate": start from an empty data/labeled, copy the labeled input into it,
# run the container in evaluate mode, then score the result with evaluate.py.
if [ "$option" == "evaluate" ]; then
sudo rm -rf data
sudo mkdir -p data/labeled
sudo cp -r $input_folder data/labeled
sudo docker run -v `pwd`/data:/default/data \
source-extractor evaluate
python3 evaluate.py
# "extract": start from an empty data/, copy the input folder into it and run
# the container in extract mode on data/$input_folder.
if [ "$option" == "extract" ]; then
sudo rm -rf data
sudo mkdir -p data
sudo cp -r $input_folder data
sudo docker run -v `pwd`/data:/default/data \
source-extractor extract data/$input_folder $output_folder $num_threads
......@@ -17,7 +17,6 @@ public class CLIParameters {
private String[] args = null;
private Options options = new Options();
public static final String OPTION_DATA_SPLIT = "s";
public static final String OPTION_DATA_TYPE_LABELED = "l";
public static final String OPTION_CONFIG = "c";
public static final String OPTION_DATA_TYPE_UNLABELED = "u";
......@@ -39,7 +38,6 @@ public class CLIParameters {
OptionGroup optionGroup = new OptionGroup();
Option split = new Option(OPTION_DATA_SPLIT, "split", false, "split data into training/dev/test sets.");
Option labeled = new Option(OPTION_DATA_TYPE_LABELED, "labeled", false, "if test files are labeled.");
// labeled.setArgName(ARGUMENT_ENCODING);
......@@ -67,7 +65,6 @@ public class CLIParameters {
Option jobNumber = new Option(OPTION_JOB_NUMBER, true, "job number (default is 1 -- no multi-threading)");
......@@ -115,7 +115,7 @@ public class FrenchLemmatizer {
return match;
public synchronized String getLemma(String token, String pos) {
public String getLemma(String token, String pos) {
String generalType = posMap.get(pos);
// System.out.println(generalType + " " + token);
......@@ -23,7 +23,7 @@ import fr.limsi.sourceExtractor.training.TrainingUnLabel;
public class MainSourceExtractor {
// Default configuration file
private static final String DEFAULT_CONFIG_FILE = "src/main/resources/config.properties";
private static final String DEFAULT_CONFIG_FILE = "config.properties";
// Configuration fields
private static final String DATA_DIR_PROPERTY = "DATA_DIR";
......@@ -43,6 +43,7 @@ public class MainSourceExtractor {
public static void main(String[] args) throws IOException, MaltChainedException, InterruptedException, URISyntaxException {
long startTime = System.currentTimeMillis();
// Debug mode
if (args.length == 0) {
// String argsStr = "-u -b -c
......@@ -103,15 +104,9 @@ public class MainSourceExtractor {
long startTime = System.currentTimeMillis();
if (cmd.hasOption(CLIParameters.OPTION_DATA_SPLIT)) {
TrainingLabel trainingLabel = new TrainingLabel(config);
// if labeled
else if (cmd.hasOption(CLIParameters.OPTION_DATA_TYPE_LABELED)) {
if (cmd.hasOption(CLIParameters.OPTION_DATA_TYPE_LABELED)) {
TrainingLabel trainingLabel = new TrainingLabel(config);
trainingLabel.train(modelSuffix, jubNumber);
......@@ -104,9 +104,9 @@ public class SourceExtractorConfig {
public SourceExtractorConfig(File dataDir, File dirLib, File dirResources) {
this.DATA_DIR = new File(dataDir.getAbsolutePath());
this.DIR_LIB = new File(dirLib.getAbsolutePath());
this.DIR_RESOURCES = new File(dirResources.getAbsolutePath());
this.DATA_DIR = dataDir;
this.DIR_LIB = dirLib;
this.DIR_RESOURCES = dirResources;
public void trainedConfig() {
......@@ -362,10 +362,6 @@ public class SourceExtractorConfig {
public Paths getPaths() {
return paths;
public File getDirLib() {
return DIR_LIB;
public Memory getMemory() {
return memory;
......@@ -687,7 +687,6 @@ public abstract class AProcess extends AProcessSupport {
assert fieldsResultWapiti[SourceExtractorConstant.numberFieldsPrim].endsWith(sourceExt)
|| fieldsResultWapiti[SourceExtractorConstant.numberFieldsPrim].equals(SourceExtractorConstant.OUT);
if (index + gapForEndOffset < listOfFieldsResultWapiti.size()
&& SourceExtractorConstant.numberFieldsPrim < fieldsResultWapiti.length
&& fieldsResultWapiti[SourceExtractorConstant.numberFieldsPrim].equals(SourceExtractorConstant.encodingI + sourceExt)) {
while (index + gapForEndOffset < listOfFieldsResultWapiti.size() && fieldsResultWapiti.length > 1
......@@ -746,7 +745,6 @@ public abstract class AProcess extends AProcessSupport {
assert fieldsResultWapitiSec[SourceExtractorConstant.numberFieldsSec].endsWith(sourceExt)
|| fieldsResultWapitiSec[SourceExtractorConstant.numberFieldsSec].equals(SourceExtractorConstant.OUT);
if (index + gapForEndOffset < listOfFieldsResultWapiti.size()
&& SourceExtractorConstant.numberFieldsSec < fieldsResultWapitiSec.length
&& fieldsResultWapitiSec[SourceExtractorConstant.numberFieldsSec].equals(SourceExtractorConstant.encodingI + sourceExt)
&& listOfFieldsResultWapiti.get(index + gapForEndOffset).length != 1) {
while (index + gapForEndOffset < listOfFieldsResultWapiti.size() && fieldsResultWapitiSec.length != 1
......@@ -84,10 +84,7 @@ public class ExtractSource extends TrainingUnLabel {
WapitiLabeling wapitiPrim = WapitiLabeling.getWapitiInstance(modelPrim);
// wapiti label -m modelPrim input=tests_files_one_by_one
if (searchSecondary) {
......@@ -98,10 +95,7 @@ public class ExtractSource extends TrainingUnLabel {
// wapiti label -m modelSec input=tests_files_one_by_one
// -p
......@@ -15,25 +15,6 @@ public class TrainingLabel extends AProcess {
public TrainingLabel(SourceExtractorConfig extractorConfig) {
public void splitTrainDevTest() throws IOException, InterruptedException {
// we convert all input annotation files into SOURCE-PRIM and
System.out.println("Transformation en SOURCE-PRIM et SOURCE-SEC réussie.");
// we start with tagging each files and launch maltparser on these
// same files. We eventually end with the (B)IO conversion.
System.out.println("Tous les fichiers ont été transformés en (B)IO sans problème.");
public void train(String modelSuffix, int jubNumber) throws IOException, InterruptedException {
if (!org.apache.commons.lang3.SystemUtils.IS_OS_LINUX) {
package fr.limsi.sourceExtractor.wapiti;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ForkJoinPool;
......@@ -12,59 +9,22 @@ import java.util.concurrent.ForkJoinPool;
import org.apache.commons.lang3.SystemUtils;
import fr.limsi.sourceExtractor.ProcessingThreadFactory;
import fr.limsi.wapiti.Wapiti;
public abstract class WapitiLabeling {
public abstract void label(File input, File outputDir, String filename)
public abstract void label(File input, File outputDir, String filename)
throws UnsupportedEncodingException, FileNotFoundException;
* Méthode permettant d'utiliser wapiti sur les fichiers de tests
* @param input Fichier ou dossier à tester
* @param modelsFile Le modèle wapiti
* @param outputDir Le dossier de sortie des fichiers
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
* @throws InterruptedException
* @param input Fichier ou dossier à tester
* @param modelsFile Le modèle wapiti
* @param outputDir Le dossier de sortie des fichiers
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
* @throws InterruptedException
public void wapitiTest(File inputDir, File outputDir, int jobNumber,
String modelFile, String dirLib)
throws UnsupportedEncodingException, FileNotFoundException, InterruptedException {
if (!outputDir.exists()) {
try {
String command = String.format("%s/wapiti"
+ " label -m "
+ "%s "
+ "-i %s/ "
+ "-o %s/ -p", dirLib, modelFile, inputDir.getAbsolutePath(), outputDir.getAbsolutePath());
String line;
Process p = Runtime.getRuntime().exec(command);
BufferedReader bri = new BufferedReader(new InputStreamReader(p.getInputStream()));
BufferedReader bre = new BufferedReader(new InputStreamReader(p.getErrorStream()));
while ((line = bri.readLine()) != null) {
while ((line = bre.readLine()) != null) {
} catch (IOException e) {
public void wapitiTest(File input, File outputDir, int jobNumber)
throws UnsupportedEncodingException, FileNotFoundException, InterruptedException {
public void wapitiTest(File input, File outputDir, int jobNumber) throws UnsupportedEncodingException, FileNotFoundException, InterruptedException {
if (!outputDir.exists()) {
......@@ -91,36 +51,39 @@ public abstract class WapitiLabeling {
throw new RuntimeException();
public void wapitiLabeled(File input, String fileId, String extension, File outputDir) {
public void wapitiLabeled(File input, String fileId, String extension, File outputDir){
if (!outputDir.exists()) {
File inFile = new File(input, fileId + "." + extension);
File inFile = new File(input, fileId+"."+extension);
WapitiTask task = new WapitiTask(inFile, outputDir, this);
// Doublon par rapport a WapitiTask
* private static void wapitiLabelFile(File input, File outputDir,
* WapitiLabeling wapitiLabeler) throws UnsupportedEncodingException,
* FileNotFoundException { String filename = input.getName();
* int pos = filename.lastIndexOf(".");
* if (pos > 0) { filename = filename.substring(0, pos); }
* wapitiLabeler.label(input, outputDir, filename); }
public static WapitiLabeling getWapitiInstance(File modelFile)
//Doublon par rapport a WapitiTask
/*private static void wapitiLabelFile(File input, File outputDir, WapitiLabeling wapitiLabeler)
throws UnsupportedEncodingException, FileNotFoundException {
String filename = input.getName();
int pos = filename.lastIndexOf(".");
if (pos > 0) {
filename = filename.substring(0, pos);
wapitiLabeler.label(input, outputDir, filename);
public static WapitiLabeling getWapitiInstance(File modelFile) throws UnsupportedEncodingException, FileNotFoundException {
if (org.apache.commons.lang3.SystemUtils.IS_OS_LINUX || SystemUtils.IS_OS_MAC) {
return new WapitiLabelingLinux(modelFile);
} else if (SystemUtils.IS_OS_WINDOWS) {
else if (SystemUtils.IS_OS_WINDOWS) {
return new WapitiLabelingWindows(modelFile);
} else {
else {
throw new RuntimeException("Unknown OS");
## Directory containing the libraries and models
## Directory containing the language-dependent resources
## DATA_DIR should only be set correctly for training the models
## The directory is useless in production mode
sudo docker rm $(sudo docker ps -qa)
Wapiti - A linear-chain CRF tool
Copyright (c) 2009-2013 CNRS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
Release v1.5.0: Update mode and bug fixes
Add precision specifier for model dumping
Add update mode to modify a model
Lots of english corrections in the manual
Fix bug in model format compatibility
Fix memory allocation with large models
Fix small memory bug in quark database
Fix bug with bigram features in raw mode
Release v1.4.0: Forced decoding, optimizer state, and bug fixes
Add forced decoding to partially decode sequences
Add optimizer state saving for L-BFGS and R-PROP
Switched to elapsed time instead of wall time in progress
Fix bug in Makefile (thanks to Lars Buitinck)
Fix local normalization decoding in MEMM (thanks to Anoop Deoras)
Fix bad handling of objwin option
Fix bug in reader for single letter obs and lbl
Release v1.3.0: MEMM, faster gradient and bug fixes
Added support for Maximum Entropy Markov Models.
Added use of atomic operation in gradient computation
Improved RProp numerical stability
Fix bug with unseen features in raw mode (thanks to Jurgen Van Gael)
Fix bug discarding some features in maxent mode (thanks to George Foster)
Switched code to stdint, should resolve some issue with size_t
on exotic systems.
Release v1.2.0: RProp improvements and bug fix
Switch from splay-trees to critbit-tries
Add RProp+ and RProp- variants of the RProp algorithm
Add a new projection scheme for RProp with l1
Make maxent work with sequences
Add space matching in regexp
Fix a few small bugs
Release v1.1.3: Some small improvements
New option --jobsize for fine grained multi threading
Improved SGD index construction : a lot faster
Fix a small bug in sparse multi-threaded gradient
Release v1.1.2: Bug fix release
Fix a small bug in L-BFGS/OWL-QN, should improve a bit
convergence speed in some case.
Fix a bug in multi-thread job system thanks to Alexander Fraser,
should fix error rates and training speed on large dataset.
Fix two small memory leaks.
Some improvements in quark database handling.
Release v1.1.1: Mainly multi-threading improvements
RPROP algorithm is now fully multi-threaded.
Error rate estimation during training is now multi-threaded.
Better jobs scheduling in multi-threaded gradient.
Multi-threading code can be disabled (compilation on Windows should
be simpler).
Fixed bug in L1 optimization with RPROP (should improve stability).
Release v1.1.0: A few new features
Added maxent mode.
Added decoding through posteriors, this should improve accuracy
at the price of computational time.
Added the RPROP optimization algorithm.
Added absolute indexing in patterns.
Changed the scored output format as the posterior decoding
provide normalized score at each position. The output is now
compatible with CRF++.
Some code cleanup.
Release v1.0.2: Mainly a bug fix version.
Fixed some memory leaks, thanks to David Keeler
Fixed argument processing to be more user friendly
Fixed small bug in model compaction
Added reading of raw files
Spell corrections in man page
Release v1.0.0: Initial public version.
Wapiti installation
If you have a recent compiler, normally you can just do the classical:
make install
switch to super user for the second. If you want to install somewhere else than
in /usr/local you will have to edit the variable definitions at the head of the
You can disable the non C99 compliant features by modifying the wapiti.h in the
src/ directory. This should allow you to compile Wapiti on almost any platform
that has a C99 compiler.
# Build settings: C99 with warnings and -O3, linked against libm and pthreads.
# PREFIX is the installation root used by the install target.
# NOTE(review): SRC, HDR and INSTALL_DATA are referenced below but defined
# outside this excerpt.
CFLAGS =-std=c99 -W -Wall -Wextra -O3
LIBS =-lm -lpthread
PREFIX =/usr/local
INSTALL= install -p
# Release build: assertions disabled through -DNDEBUG.
wapiti: $(SRC) $(HDR)
@echo "CC: wapiti.c --> wapiti"
@$(CC) -DNDEBUG $(CFLAGS) -o wapiti $(SRC) $(LIBS)
# Debug build: same sources, compiled with -g and without -DNDEBUG.
debug: $(SRC) $(HDR)
@echo "CC: wapiti.c --> wapiti"
@$(CC) -g $(CFLAGS) -o wapiti $(SRC) $(LIBS)
# Install under $(DESTDIR)$(PREFIX): creates bin/ and share/man/man1 and copies
# the manual page.
install: wapiti
@echo "CP: wapiti --> $(DESTDIR)$(PREFIX)/bin"
@mkdir -p $(DESTDIR)$(PREFIX)/bin
@mkdir -p $(DESTDIR)$(PREFIX)/share/man/man1
@$(INSTALL_DATA) doc/wapiti.1 $(DESTDIR)$(PREFIX)/share/man/man1
@echo "RM: wapiti"
@rm -f wapiti
.PHONY: clean install
# Wapiti - A linear-chain CRF tool
Copyright (c) 2009-2013 CNRS
All rights reserved.
For more detailed information see the [homepage](http://wapiti.limsi.fr).
Wapiti is a very fast toolkit for segmenting and labeling sequences with
discriminative models. It is based on maxent models, maximum entropy Markov
models and linear-chain CRF and proposes various optimization and regularization
methods to improve both the computational complexity and the prediction
performance of standard models. Wapiti is ranked first on the sequence tagging
task for more than a year on MLcomp web site.
Wapiti is developed by LIMSI-CNRS and was partially funded by ANR projects
CroTaL (ANR-07-MDCO-003) and MGA (ANR-07-BLAN-0311-02).
For suggestions, comments, or patches, you can contact me at lavergne@limsi.fr
If you use Wapiti for research purpose, please use the following citation:
author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon,
title = {Practical Very Large Scale {CRFs}},
booktitle = {Proceedings of the 48th Annual Meeting of the Association
for Computational Linguistics ({ACL})},
month = {July},
year = {2010},
location = {Uppsala, Sweden},
publisher = {Association for Computational Linguistics},
pages = {504--513},
url = {http://www.aclweb.org/anthology/P10-1052}
U:Wrd-1 X=%x[ 0,0]
U:wrd-1 L=%X[-1,0]
U:wrd-1 X=%X[ 0,0]
U:wrd-1 R=%X[ 1,0]
U:wrd-1RR=%X[ 2,0]
U:wrd-2 L=%X[-1,0]/%X[ 0,0]
U:wrd-2 R=%X[ 0,0]/%X[ 1,0]
*:Pos-1 L=%x[-1,1]
*:Pos-1 X=%x[ 0,1]
*:Pos-1 R=%x[ 1,1]
*:Pos-1RR=%x[ 2,1]
U:Pos-2 L=%X[-1,1]/%X[ 0,1]
U:Pos-2 R=%X[ 0,1]/%X[ 1,1]
*:Pre-1 X=%m[ 0,0,"^.?"]
*:Pre-2 X=%m[ 0,0,"^.?.?"]
*:Pre-3 X=%m[ 0,0,"^.?.?.?"]
*:Pre-4 X=%m[ 0,0,"^.?.?.?.?"]
*:Suf-1 X=%m[ 0,0,".?$"]
*:Suf-2 X=%m[ 0,0,".?.?$"]
*:Suf-3 X=%m[ 0,0,".?.?.?$"]
*:Suf-4 X=%m[ 0,0,".?.?.?.?$"]
*:Caps? L=%t[-1,0,"\u"]
*:Caps? X=%t[ 0,0,"\u"]
*:Caps? R=%t[ 1,0,"\u"]
*:AllC? X=%t[ 0,0,"^\u*$"]
*:BegC? X=%t[ 0,0,"^\u"]
*:Punc? L=%t[-1,0,"\p"]
*:Punc? X=%t[ 0,0,"\p"]
*:Punc? R=%t[ 1,0,"\p"]
*:AllP? X=%t[ 0,0,"^\p*$"]
*:InsP? X=%t[ 0,0,".\p."]
*:Numb? L=%t[-1,0,"\d"]
*:Numb? X=%t[ 0,0,"\d"]
*:Numb? R=%t[ 1,0,"\d"]
*:AllN? X=%t[ 0,0,"^\d*$"]
