Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
source-extractor
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
asrael
source-extractor
Commits
42a0c05a
Commit
42a0c05a
authored
Sep 13, 2018
by
Bertrand Goupil
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add NewsML revisionDate in Json response
parent
d3a81554
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
826 additions
and
836 deletions
+826
-836
src/main/java/fr/limsi/sourceExtractor/Memory.java
src/main/java/fr/limsi/sourceExtractor/Memory.java
+6
-4
src/main/java/fr/limsi/sourceExtractor/SourceExtractor.java
src/main/java/fr/limsi/sourceExtractor/SourceExtractor.java
+783
-826
src/main/java/fr/limsi/sourceExtractor/application/configuration/SourceExtractorConfig.java
...ctor/application/configuration/SourceExtractorConfig.java
+3
-3
src/main/java/fr/limsi/sourceExtractor/application/process/SimplePreprocessing.java
...rceExtractor/application/process/SimplePreprocessing.java
+32
-2
src/main/java/fr/limsi/sourceExtractor/training/AProcess.java
...main/java/fr/limsi/sourceExtractor/training/AProcess.java
+2
-1
No files found.
src/main/java/fr/limsi/sourceExtractor/Memory.java
View file @
42a0c05a
...
...
@@ -7,19 +7,21 @@ public class Memory {
public
ConcurrentHashMap
<
String
,
String
>
parsedTexts
;
public
ConcurrentHashMap
<
String
,
String
>
docTexts
;
public
ConcurrentHashMap
<
String
,
ArrayList
<
Integer
>>
sentenceOffsetsByFile
;
public
ConcurrentHashMap
<
String
,
String
>
revisionDate
;
public
Memory
(
ConcurrentHashMap
<
String
,
String
>
parsedTexts
,
ConcurrentHashMap
<
String
,
String
>
docTexts
,
ConcurrentHashMap
<
String
,
ArrayList
<
Integer
>>
sentenceOffsetsByFile
)
{
ConcurrentHashMap
<
String
,
ArrayList
<
Integer
>>
sentenceOffsetsByFile
,
ConcurrentHashMap
<
String
,
String
>
revisionDate
)
{
this
.
parsedTexts
=
parsedTexts
;
this
.
docTexts
=
docTexts
;
this
.
sentenceOffsetsByFile
=
sentenceOffsetsByFile
;
this
.
revisionDate
=
revisionDate
;
}
/**
 * Drops everything stored for one file.
 *
 * <p>The entry keyed by {@code fileId} is removed from each of the four maps held by this
 * object: parsed texts, document texts, sentence offsets and revision dates.
 *
 * @param fileId key of the entries to remove
 */
public void cleanEntry(String fileId) {
    parsedTexts.remove(fileId);
    docTexts.remove(fileId);
    sentenceOffsetsByFile.remove(fileId);
    revisionDate.remove(fileId);
}
}
src/main/java/fr/limsi/sourceExtractor/SourceExtractor.java
View file @
42a0c05a
This source diff could not be displayed because it is too large. You can
view the blob
instead.
src/main/java/fr/limsi/sourceExtractor/application/configuration/SourceExtractorConfig.java
View file @
42a0c05a
...
...
@@ -85,9 +85,9 @@ public class SourceExtractorConfig {
loadWapitiModels
(
""
);
//config();
}
@PreDestroy
public
void
destroy
(){
public
void
destroy
()
{
logger
.
debug
(
"destroy"
);
try
{
FileUtils
.
forceDelete
(
this
.
tempDir
);
...
...
@@ -99,7 +99,7 @@ public class SourceExtractorConfig {
public
SourceExtractorConfig
()
{
this
.
paths
=
new
Paths
();
this
.
memory
=
new
Memory
(
new
ConcurrentHashMap
<>(),
new
ConcurrentHashMap
<>(),
new
ConcurrentHashMap
<>());
this
.
memory
=
new
Memory
(
new
ConcurrentHashMap
<>(),
new
ConcurrentHashMap
<>(),
new
ConcurrentHashMap
<>()
,
new
ConcurrentHashMap
<>()
);
}
public
SourceExtractorConfig
(
File
dataDir
,
File
dirLib
,
File
dirResources
)
{
...
...
src/main/java/fr/limsi/sourceExtractor/application/process/SimplePreprocessing.java
View file @
42a0c05a
...
...
@@ -4,6 +4,9 @@ import java.io.File;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.io.StringWriter
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.Date
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
...
...
@@ -21,13 +24,18 @@ import fr.limsi.sourceExtractor.training.AProcessSupport;
public
class
SimplePreprocessing
{
private
Pattern
pattern
,
patternIdentifier
;
private
Pattern
pattern
,
patternIdentifier
,
patternCreationDate
;
private
AProcessSupport
processSupport
;
private
SimpleDateFormat
utcFormat
,
format2
;
/**
 * Creates a preprocessor and prepares the regular expressions and date formats it uses.
 *
 * <p>NOTE(review): {@link SimpleDateFormat} instances are not thread-safe; confirm that a single
 * {@code SimplePreprocessing} instance is not used from several threads at once.
 *
 * @param processSupport support object stored on this instance
 */
public SimplePreprocessing(AProcessSupport processSupport) {
    this.processSupport = processSupport;

    // Capture groups: paragraph text, public identifier and revision creation date.
    this.pattern = Pattern.compile("<p>([^<]+)</p>");
    this.patternIdentifier = Pattern.compile("<PublicIdentifier>([^<]+)</PublicIdentifier>");
    this.patternCreationDate = Pattern.compile("<ThisRevisionCreated>([^<]+)</ThisRevisionCreated>");

    this.utcFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
    // The trailing 'Z' in the pattern is a quoted literal, not a zone field, so the formatter
    // must be pinned to UTC explicitly; otherwise it renders the JVM default zone's local time
    // while still labelling it as UTC.
    this.utcFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));

    // Input format carrying a numeric zone offset (pattern letters XX, e.g. +0200).
    this.format2 = new SimpleDateFormat("yyyyMMdd'T'HHmmssXX");
}
public
String
tagXMLText
(
InputStream
contentStream
,
SourceExtractorConfig
config
)
throws
IOException
{
...
...
@@ -40,6 +48,7 @@ public class SimplePreprocessing {
//String fileId = UUID.randomUUID().toString();
String
fileId
=
extractPublicIdentifier
(
xmlText
);
String
text
=
extractTextFromXML
(
fileId
,
xmlText
,
memory
);
extractRevisionDate
(
fileId
,
xmlText
,
memory
);
// we create the files in which we will write the results
File
outFilePRIM
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_PRIM
,
fileId
,
".tag"
);
File
outFileSEC
=
DIRUtils
.
createDirAndFilesWithExt
(
paths
.
DIR_TEST_FILES_UNLABELED_SEC
,
fileId
,
".tag"
);
...
...
@@ -62,7 +71,7 @@ public class SimplePreprocessing {
private
String
extractPublicIdentifier
(
String
xmlText
)
{
Matcher
matcher
=
this
.
patternIdentifier
.
matcher
(
xmlText
);
String
fileId
=
""
;
String
fileId
=
""
;
while
(
matcher
.
find
())
{
fileId
=
matcher
.
group
(
1
);
}
...
...
@@ -70,6 +79,27 @@ public class SimplePreprocessing {
return
fileId
;
}
/**
 * Extracts the revision creation date from the XML text and stores it in
 * {@code memory.revisionDate} under {@code fileId}.
 *
 * <p>The content of the last {@code <ThisRevisionCreated>} element found is used, with carriage
 * returns removed. A value containing "Z" is stored as is; any other non-empty value is parsed
 * with {@code format2} and re-formatted with {@code utcFormat}. When no element is found, or the
 * value cannot be parsed, an empty string is stored.
 *
 * <p>NOTE(review): the formatters are {@link java.text.SimpleDateFormat} instances, which are
 * not thread-safe; confirm this method is not called concurrently on the same instance.
 *
 * @param fileId  key under which the date is stored
 * @param xmlText XML text to search
 * @param memory  holder whose {@code revisionDate} map receives the result
 */
private void extractRevisionDate(String fileId, String xmlText, Memory memory) {
    Matcher matcher = this.patternCreationDate.matcher(xmlText);
    String revisionDateStr = "";
    // Keep the last occurrence when the element appears more than once.
    while (matcher.find()) {
        revisionDateStr = matcher.group(1);
    }
    // Strings are immutable: the stripped value has to be assigned back, otherwise the
    // carriage returns remain in the stored/parsed value.
    revisionDateStr = revisionDateStr.replace("\r", "");

    String strDate = "";
    if (revisionDateStr.contains("Z")) {
        strDate = revisionDateStr;
    } else if (!revisionDateStr.isEmpty()) {
        // Only attempt a parse when there is something to parse, instead of relying on the
        // exception raised for an empty string.
        try {
            Date date = format2.parse(revisionDateStr);
            strDate = utcFormat.format(date);
        } catch (ParseException ignored) {
            // Best effort: an unparseable revision date is reported as an empty string
            // rather than failing the whole document.
        }
    }
    memory.revisionDate.put(fileId, strDate);
}
private
String
extractTextFromXML
(
String
fileId
,
String
xmlText
,
Memory
memory
)
{
Matcher
matcher
=
this
.
pattern
.
matcher
(
xmlText
);
...
...
src/main/java/fr/limsi/sourceExtractor/training/AProcess.java
View file @
42a0c05a
...
...
@@ -962,13 +962,14 @@ public abstract class AProcess extends AProcessSupport {
private
StringBuilder
brat2JSON
(
String
fileId
,
HashMap
<
String
,
SourceAnnotation
>
annotations
)
{
String
docText
=
this
.
memory
.
docTexts
.
get
(
fileId
);
String
revisionDateStr
=
this
.
memory
.
revisionDate
.
get
(
fileId
);
ArrayList
<
Integer
>
sentenceOffsets
=
this
.
memory
.
sentenceOffsetsByFile
.
get
(
fileId
);
TreeSet
<
SourceAnnotation
>
orderedAnnotations
=
new
TreeSet
<>();
orderedAnnotations
.
addAll
(
annotations
.
values
());
StringBuilder
jsonResult
=
new
StringBuilder
();
jsonResult
.
append
(
"{\"identifier\":\""
+
fileId
+
"\",\n"
);
jsonResult
.
append
(
"\"revisionDate\":\""
+
revisionDateStr
+
"\",\n"
);
jsonResult
.
append
(
"\"source_sentences\":[\n"
);
int
annIndex
=
0
;
ArrayList
<
SourceAnnotation
>
sentenceAnnotations
=
new
ArrayList
<>();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment