Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Duc Cao
source-extractor
Commits
7423d71b
Commit
7423d71b
authored
Nov 07, 2017
by
Bertrand Goupil
Browse files
Use of the news ID
- Replace the temporary UUID generation with the news ID - Add the news ID to the JSON response
parent
03100854
Changes
7
Show whitespace changes
Inline
Side-by-side
.factorypath
View file @
7423d71b
...
@@ -40,9 +40,9 @@
...
@@ -40,9 +40,9 @@
<factorypathentry kind="VARJAR" id="M2_REPO/net/sourceforge/saxon/saxon/9.1.0.8/saxon-9.1.0.8.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/net/sourceforge/saxon/saxon/9.1.0.8/saxon-9.1.0.8.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xom/xom/1.2.5/xom-1.2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xom/xom/1.2.5/xom-1.2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xalan/xalan/2.7.0/xalan-2.7.0.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/xalan/xalan/2.7.0/xalan-2.7.0.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/lib/jar/stanford-french-corenlp-2016-01-14-models.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
git/code
/Limsi-SourceExtractor/lib/jar/stanford-french-corenlp-2016-01-14-models.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/lib/jar/wapiti-1.5.0-win.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
git/code
/Limsi-SourceExtractor/lib/jar/wapiti-1.5.0-win.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/lib/jar/hfst-ol.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="EXTJAR" id="/Users/bertrand/
git/code
/Limsi-SourceExtractor/lib/jar/hfst-ol.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-io/commons-io/2.5/commons-io-2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-io/commons-io/2.5/commons-io-2.5.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/org/apache/commons/commons-lang3/3.4/commons-lang3-3.4.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/org/apache/commons/commons-lang3/3.4/commons-lang3-3.4.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-cli/commons-cli/1.3.1/commons-cli-1.3.1.jar" enabled="true" runInBatchMode="false"/>
<factorypathentry kind="VARJAR" id="M2_REPO/commons-cli/commons-cli/1.3.1/commons-cli-1.3.1.jar" enabled="true" runInBatchMode="false"/>
...
...
configuration/application.yml
View file @
7423d71b
server
:
server
:
context-path
:
/sourceExtractor
context-path
:
/
limsi-
sourceExtractor
resource
:
resource
:
#Directory containing the libraries and models
#Directory containing the libraries and models
...
...
pom.xml
View file @
7423d71b
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
<parent>
<parent>
<groupId>
org.springframework.boot
</groupId>
<groupId>
org.springframework.boot
</groupId>
<artifactId>
spring-boot-starter-parent
</artifactId>
<artifactId>
spring-boot-starter-parent
</artifactId>
<version>
1.5.
2
.RELEASE
</version>
<version>
1.5.
8
.RELEASE
</version>
</parent>
</parent>
<name>
LimsiSourceExtractor
</name>
<name>
LimsiSourceExtractor
</name>
<properties>
<properties>
...
@@ -156,6 +156,7 @@
...
@@ -156,6 +156,7 @@
<artifactId>
spring-boot-starter-tomcat
</artifactId>
<artifactId>
spring-boot-starter-tomcat
</artifactId>
<scope>
provided
</scope>
<scope>
provided
</scope>
</dependency>
</dependency>
<!-- <dependency> -->
<!-- <dependency> -->
<!-- <groupId>org.springframework.boot</groupId> -->
<!-- <groupId>org.springframework.boot</groupId> -->
<!-- <artifactId>spring-boot-starter-actuator</artifactId> -->
<!-- <artifactId>spring-boot-starter-actuator</artifactId> -->
...
...
src/main/java/fr/limsi/sourceExtractor/application/process/SimplePreprocessing.java
View file @
7423d71b
...
@@ -4,7 +4,6 @@ import java.io.File;
...
@@ -4,7 +4,6 @@ import java.io.File;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.io.InputStream
;
import
java.io.StringWriter
;
import
java.io.StringWriter
;
import
java.util.UUID
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
...
@@ -22,11 +21,12 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
...
@@ -22,11 +21,12 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
public
class
SimplePreprocessing
{
public
class
SimplePreprocessing
{
private
Pattern
pattern
;
private
Pattern
pattern
,
patternIdentifier
;
private
AProcessSupport
processSupport
;
private
AProcessSupport
processSupport
;
public
SimplePreprocessing
(
AProcessSupport
processSupport
)
{
public
SimplePreprocessing
(
AProcessSupport
processSupport
)
{
this
.
pattern
=
Pattern
.
compile
(
"<p>([^<]+)</p>"
);
this
.
pattern
=
Pattern
.
compile
(
"<p>([^<]+)</p>"
);
this
.
patternIdentifier
=
Pattern
.
compile
(
"<PublicIdentifier>([^<]+)</PublicIdentifier>"
);
this
.
processSupport
=
processSupport
;
this
.
processSupport
=
processSupport
;
}
}
...
@@ -36,8 +36,10 @@ public class SimplePreprocessing {
...
@@ -36,8 +36,10 @@ public class SimplePreprocessing {
Paths
paths
=
config
.
getPaths
();
Paths
paths
=
config
.
getPaths
();
Tools
tools
=
config
.
getTools
();
Tools
tools
=
config
.
getTools
();
Resources
resources
=
config
.
getResources
();
Resources
resources
=
config
.
getResources
();
String
fileId
=
UUID
.
randomUUID
().
toString
();
String
xmlText
=
toString
(
contentStream
);
String
text
=
extractTextFromXML
(
fileId
,
contentStream
,
memory
);
//String fileId = UUID.randomUUID().toString();
String
fileId
=
extractPublicIdentifier
(
xmlText
);
String
text
=
extractTextFromXML
(
fileId
,
xmlText
,
memory
);
// we create the files in which we will write the results
// we create the files in which we will write the results
File
outFilePRIM
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_PRIM
,
fileId
,
".tag"
);
File
outFilePRIM
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_PRIM
,
fileId
,
".tag"
);
File
outFileSEC
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_SEC
,
fileId
,
".tag"
);
File
outFileSEC
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_SEC
,
fileId
,
".tag"
);
...
@@ -51,10 +53,25 @@ public class SimplePreprocessing {
...
@@ -51,10 +53,25 @@ public class SimplePreprocessing {
return
fileId
;
return
fileId
;
}
}
private
String
extractTextFromXML
(
String
fileId
,
InputStream
contentStream
,
Memory
memory
)
throws
IOException
{
private
String
toString
(
InputStream
contentStream
)
throws
IOException
{
StringWriter
writer
=
new
StringWriter
();
StringWriter
writer
=
new
StringWriter
();
IOUtils
.
copy
(
contentStream
,
writer
,
Charsets
.
UTF_8
);
IOUtils
.
copy
(
contentStream
,
writer
,
Charsets
.
UTF_8
);
String
xmlText
=
writer
.
toString
();
String
xmlText
=
writer
.
toString
();
return
xmlText
;
}
/**
 * Extracts the news identifier from the raw XML text.
 *
 * <p>Scans {@code xmlText} with {@code patternIdentifier} and keeps the captured
 * group of the last match found. Carriage returns are then removed from the
 * captured value.
 *
 * @param xmlText the XML document content as a string
 * @return the captured identifier with every {@code '\r'} removed, or an empty
 *         string when the pattern never matches
 */
private String extractPublicIdentifier(String xmlText) {
    Matcher matcher = this.patternIdentifier.matcher(xmlText);
    String fileId = "";
    // Keep the last occurrence, as the original loop did.
    while (matcher.find()) {
        fileId = matcher.group(1);
    }
    // Strings are immutable: the stripped value must be returned, not discarded.
    // NOTE(review): an empty id is returned when nothing matches; confirm callers
    // handle that case.
    return fileId.replace("\r", "");
}
private
String
extractTextFromXML
(
String
fileId
,
String
xmlText
,
Memory
memory
)
{
Matcher
matcher
=
this
.
pattern
.
matcher
(
xmlText
);
Matcher
matcher
=
this
.
pattern
.
matcher
(
xmlText
);
StringBuilder
result
=
new
StringBuilder
();
StringBuilder
result
=
new
StringBuilder
();
while
(
matcher
.
find
())
{
while
(
matcher
.
find
())
{
...
...
src/main/java/fr/limsi/sourceExtractor/training/AProcess.java
View file @
7423d71b
...
@@ -963,7 +963,8 @@ public abstract class AProcess extends AProcessSupport {
...
@@ -963,7 +963,8 @@ public abstract class AProcess extends AProcessSupport {
TreeSet
<
SourceAnnotation
>
orderedAnnotations
=
new
TreeSet
<>();
TreeSet
<
SourceAnnotation
>
orderedAnnotations
=
new
TreeSet
<>();
orderedAnnotations
.
addAll
(
annotations
.
values
());
orderedAnnotations
.
addAll
(
annotations
.
values
());
StringBuilder
jsonResult
=
new
StringBuilder
();
StringBuilder
jsonResult
=
new
StringBuilder
();
jsonResult
.
append
(
"{\"source_sentences\":[\n"
);
jsonResult
.
append
(
"{\"identifier\":\""
+
fileId
+
"\",\n"
);
jsonResult
.
append
(
"\"source_sentences\":[\n"
);
int
annIndex
=
0
;
int
annIndex
=
0
;
...
...
src/main/resources/config.properties
View file @
7423d71b
## Directory containing the libraries and models
## Directory containing the libraries and models
LIB_DIR
=
/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/lib
LIB_DIR
=
/Users/bertrand/
git/code
/Limsi-SourceExtractor/lib
## Directory containing the language-dependent resources
## Directory containing the language-dependent resources
RESOURCES_DIR
=
/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/resources
RESOURCES_DIR
=
/Users/bertrand/
git/code/
/Limsi-SourceExtractor/resources
## DATA_DIR should only be set correctly for training the models
## DATA_DIR should only be set correctly for training the models
## The directory is useless in production mode
## The directory is useless in production mode
...
...
src/test/configuration/application.yml
View file @
7423d71b
...
@@ -3,9 +3,9 @@ server:
...
@@ -3,9 +3,9 @@ server:
resource
:
resource
:
#Directory containing the libraries and models
#Directory containing the libraries and models
lib
:
/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/lib
lib
:
/Users/bertrand/
git/code
/Limsi-SourceExtractor/lib
#Directory containing the language-dependent resources
#Directory containing the language-dependent resources
resources
:
/Users/bertrand/
Documents/eclipse-workspace/afp-asrael
/Limsi-SourceExtractor/resources
resources
:
/Users/bertrand/
git/code
/Limsi-SourceExtractor/resources
#Directory containing trained data
#Directory containing trained data
data
:
/home/xtannier/Recherche/SourceExtractor
data
:
/home/xtannier/Recherche/SourceExtractor
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment