Commit 92406f63 authored by Thibault Ehrhart's avatar Thibault Ehrhart

Refactor clusterize.sh script to handle languages separately

parent 0d3425ab
......@@ -14,7 +14,10 @@
# - Need to add ability to pass the path to lst (and maybe other paths that are required?) to the .sh script
#- Parse results from .clust XML file and convert its into Turtle
# Convert a formatted date string to an unix timestamp
# $1 YYYY-MM-DD date string
date_to_epoch() {
local date_ymd
date_ymd="${1}"
if [[ "$OSTYPE" == "darwin"* ]]; then
echo $(date -j -f "%F" "${date_ymd}" +"%s")
......@@ -23,9 +26,12 @@ date_to_epoch() {
fi
}
# Get the last day of the month
# $1 year
# $2 month
last_day_of_month() {
date_y="${1}"
date_m="${2}"
local date_y="${1}"
local date_m="${2}"
if [[ "$OSTYPE" == "darwin"* ]]; then
echo $(date -j -f "%F" "${date_y}/${date_m}/1 + 1 month - 1 day" +"%d")
else
......@@ -33,35 +39,63 @@ last_day_of_month() {
fi
}
cd "$(dirname "${BASH_SOURCE[0]}")" || exit
CWD="${PWD}"
# Return the absolute path to a file
# $1 path
# Return the absolute path to a file or directory (portable fallback for
# the `realpath` utility, which is not available on stock macOS).
# $1 path to an existing file or directory
# Outputs: absolute path on stdout. Returns non-zero if the parent
# directory cannot be entered (instead of silently printing the current
# directory, as the previous `cd …; pwd` form did on cd failure).
# Note: symlinks in the parent path are resolved by cd/pwd; the final
# path component is kept as-is.
realpath() {
  local parent_abs leaf
  parent_abs=$(cd "$(dirname "$1")" && pwd) || return 1
  leaf=$(basename "$1")
  printf '%s/%s\n' "${parent_abs}" "${leaf}"
}
mkdir -p "../data/resource/clustering/xml"
mkdir -p "../data/resource/clustering/txt"
mkdir -p "../data/resource/clustering/json"
mkdir -p "../data/resource/clustering/lst/corpus/json"
mkdir -p "../data/resource/clustering/lst/corpus/xml"
mkdir -p "../data/resource/clustering/out"
# Clusterize
# $1 lang_dir (eg. "/path/to/data/ENG")
# $2 data_dir (eg. "../data/resource/clustering")
# $3 past_epoch (eg. "1594804209")
clusterize() {
local lang_dir="${1}"
local data_dir=$(realpath "${2}")
local past_epoch="${3}"
local xml_dir="${data_dir}/xml"
local txt_dir="${data_dir}/txt"
local json_dir="${data_dir}/json"
local lst_dir="${data_dir}/lst/corpus"
local out_dir="${data_dir}/out"
local dir_date
local dir_epoch
local year_dir
local year_name
local month_dir
local month_name
local day_dir
local day_name
local xml_count=0
if [[ ! -d "${data_dir}" ]]; then
echo "${data_dir}: Directory does not exist."
return 1
fi
find "../data/resource/clustering/xml" -type f -name "*.xml" -delete
find "../data/resource/clustering/txt" -type f -name "*.txt" -delete
find "../data/resource/clustering/json" -type f -name "*.json" -delete
find "../data/resource/clustering/lst/corpus/json" -type f -name "*.lst" -delete
find "../data/resource/clustering/lst/corpus/xml" -type f -name "*.lst" -delete
find "../data/resource/clustering/out" -name "*.*" -delete
### Step 0
### Step 1
echo "Cleaning up previous clusterization..."
echo "Copying XML files..."
mkdir -p "${xml_dir}"
mkdir -p "${txt_dir}"
mkdir -p "${json_dir}"
mkdir -p "${lst_dir}/json"
mkdir -p "${lst_dir}/xml"
mkdir -p "${out_dir}"
current_date=$(date +'%Y-%m-%d')
current_epoch=$(date_to_epoch "${current_date}")
days=14
seconds=$(($days*24*60*60))
past_epoch=$(($current_epoch-seconds))
xml_count=0
find "${xml_dir}" -type f -name "*.xml" -delete
find "${txt_dir}" -type f -name "*.txt" -delete
find "${lst_dir}/json" -type f -name "*.lst" -delete
find "${lst_dir}/xml" -type f -name "*.lst" -delete
rm -rf "${out_dir}"
for lang_dir in "../data/resource/agencefrancepresse/"*; do
### Step 1
echo "Copying XML files..."
# Copy XML files
if [[ -d $lang_dir ]]; then
lang_dir=${lang_dir%/}
for year_dir in "${lang_dir}"/*; do
......@@ -87,7 +121,7 @@ for lang_dir in "../data/resource/agencefrancepresse/"*; do
dir_epoch=$(date_to_epoch "${dir_date}")
if [[ $dir_epoch > $past_epoch ]]; then
for f in "${day_dir}"/*.xml; do
cp "${f}" "../data/resource/clustering/xml/"
cp "${f}" "${xml_dir}/"
xml_count=$(($xml_count+1))
done
fi
......@@ -100,39 +134,80 @@ for lang_dir in "../data/resource/agencefrancepresse/"*; do
fi
done
fi
done
echo "Copied ${xml_count} XML files into ../data/resource/clustering/xml/"
echo "Copied ${xml_count} XML files into ${xml_dir}"
### Step 2
if [ "${xml_count}" -eq "0" ]; then
echo "No documents found, exiting..."
return 0
fi
### Step 2
echo "Extracting corpus from XML..."
(cd "../../afp_clustering" && pipenv run python "src/extract_corpus_from_xml.py" "${CWD}/../data/resource/clustering/xml" "${CWD}/../data/resource/clustering/txt")
echo "Extracting corpus from XML..."
(cd "${CWD}/../../afp_clustering" && python "src/extract_corpus_from_xml.py" "${xml_dir}" "${txt_dir}")
### Step 3
### Step 3
echo "Parsing corpus..."
(cd "../../afp_clustering" && pipenv run python "src/StanfordNLPCustom/corpusParser_parallel.py" "${CWD}/../data/resource/clustering/txt" "${CWD}/../data/resource/clustering/json")
echo "Parsing corpus..."
(cd "${CWD}/../../afp_clustering" && python "src/StanfordNLPCustom/corpusParser_parallel.py" "${txt_dir}" "${json_dir}")
### Step 4
### Step 4
echo "Generating listings..."
# The directory clustering/json/* acts as a cache which is never deleted (because its generation is very slow)
# and it might contains all previous JSON files, so we need a loop to only get the JSON files that match the XML files
echo "Generation XML and JSON listing..."
for i in "${xml_dir}/"*.xml; do
xml_filename="$(basename "$i")"
if [ -f "${json_dir}/${xml_filename%.*}.txt.json" ]; then
echo "${json_dir}/${xml_filename%.*}.txt.json" >> "${lst_dir}/json/corpus.lst"
echo "$i" >> "${lst_dir}/xml/corpus.lst"
else
echo "${xml_filename%.*}.txt.json: File does not exist. Ignoring the XML version to prevent errors."
fi
done
find "$(cd "${CWD}/../data/resource/clustering/xml" || exit; pwd)" -name "*.xml" > "${CWD}/../data/resource/clustering/lst/corpus/xml/corpus.lst"
find "$(cd "${CWD}/../data/resource/clustering/json" || exit; pwd)" -name "*.json" > "${CWD}/../data/resource/clustering/lst/corpus/json/corpus.lst"
### Step 5
### Step 5
echo "Executing afp_clustering_json_parallel_grid"
(\
cd "${CWD}/../../afp_clustering" && \
LST_DIR="${lst_dir}" \
OUT_BASE="${out_dir}" \
pipenv run bash ./afp_clustering_json_parallel_grid.sh \
)
echo "Executing afp_clustering_json_parallel_grid"
(\
cd "../../afp_clustering" && \
LST_DIR="${CWD}/../data/resource/clustering/lst/corpus" \
OUT_BASE="${CWD}/../data/resource/clustering/out" \
pipenv run bash ./afp_clustering_json_parallel_grid.sh \
)
### Step 6
### Step 6
echo "Converting clusters to RDF"
cd "${CWD}/.." || return
java -Dlogfile.name=asrael-clusters-converter.log -Dlogfile.append=true -jar clusters-converter-1.0-SNAPSHOT.jar
}
echo "Converting clusters to RDF"
cd "${CWD}/.." || exit 1
java -Dlogfile.name=asrael-clusters-converter.log -Dlogfile.append=true -jar clusters-converter-1.0-SNAPSHOT.jar
# --- Script entry point -------------------------------------------------
# Activate conda environment
# Best effort: only attempted when `conda` is on PATH; activation failure
# aborts the script. Presumably the asrael-clustering env provides the
# Python dependencies used by clusterize() — TODO confirm.
if command -v conda &> /dev/null ; then
echo "Activating conda environment..."
eval "$(conda shell.bash hook)"
conda activate asrael-clustering || exit
fi
# Run relative to the directory containing this script so the relative
# ../data paths below resolve regardless of the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit
CWD="${PWD}"
echo "CWD: ${CWD}"
# Compute the cutoff epoch: only documents newer than (now - days) are
# processed. The window defaults to 14 days and can be overridden via the
# DAYS environment variable.
current_date=$(date +'%Y-%m-%d')
current_epoch=$(date_to_epoch "${current_date}")
days=${DAYS:-14}
seconds=$(($days*24*60*60))
past_epoch=$(($current_epoch-seconds))
echo "Days: ${days}"
# Clusterize each language directory (e.g. ENG, FRA) separately; non-directory
# entries under agencefrancepresse/ are reported and skipped.
for lang_dir in "../data/resource/agencefrancepresse/"*; do
if [[ -d "${lang_dir}" ]]; then
echo "clusterize: ${lang_dir}"
clusterize "$lang_dir" "../data/resource/clustering" "$past_epoch"
else
echo "${lang_dir}: Not a directory. It has been ignored."
fi
done
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment