Commit 1c21d501 authored by Thibault Ehrhart

Add scripts to automate classification and clustering

parent f4cf9a55
#!/usr/bin/env bash
# Cron workflow:
# - Remove XML files from `news-kb/data/resource/classification/xml`
# - Remove out.pkl and out.json from `news-kb/data/resource/classification`
# - Copy all XML files from the last 90 days (relative to the current date) into
#   `news-kb/data/resource/classification/xml`
# - Run `schema_classification/03-apply_schema_classifiers.py`
# - Run `news-kb/scripts/pkl_classification_to_json.py` on the copied
#   `news-kb/data/resource/classification/out.pkl`
# - Run classification-converter to convert the output to RDF
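#
# A possible crontab entry for this script (schedule, filename, and paths are
# assumptions, not part of this commit):
#   0 4 * * * cd /path/to/news-kb/scripts && bash ./run_classification.sh >> /var/log/asrael-classification.log 2>&1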
date_to_epoch() {
  date_ymd="${1}"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    date -j -f "%F" "${date_ymd}" +"%s"
  else
    date -d "${date_ymd}" +%s
  fi
}
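# Example: `date_to_epoch "2021-03-15"` prints the Unix timestamp for that date.
# Note: GNU date yields midnight local time, while BSD date fills the time-of-day
# fields from the current time, so the two branches can differ by several hours.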
last_day_of_month() {
  date_y="${1}"
  date_m="${2}"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    # BSD date needs -v adjustments: parse the first of the month, then shift
    date -j -f "%Y/%m/%d" -v+1m -v-1d "${date_y}/${date_m}/01" +"%d"
  else
    date -d "${date_y}/${date_m}/1 + 1 month - 1 day" +%d
  fi
}
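# Example: `last_day_of_month 2020 02` prints "29" (2020 is a leap year).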
cd "$(dirname "${BASH_SOURCE[0]}")"
CWD="${PWD}"
mkdir -p "../data/resource/classification"
mkdir -p "../data/resource/classification/xml"
### Step 1
if [ -z "$SKIP_STEP_1" ]; then
echo "Removing previous XML files from ../data/resource/classification/xml"
find "../data/resource/classification/xml" -type f -name "*.xml" -delete
find "../data/resource/classification" -type f -name "out.pkl" -delete
find "../data/resource/classification" -type f -name "out.json" -delete
echo "Copying XML files..."
current_date=$(date +'%Y-%m-%d')
current_epoch=$(date_to_epoch "${current_date}")
days=90
seconds=$(($days*24*60*60))
past_epoch=$(($current_epoch-seconds))
xml_count=0
for lang_dir in "../data/resource/agencefrancepresse/"*; do
if [[ -d $lang_dir ]]; then
lang_dir=${lang_dir%/}
for year_dir in "${lang_dir}"/*; do
year_dir=${year_dir%/}
if [[ -d $year_dir ]]; then
year_name=$(basename "${year_dir}")
dir_date="${year_name}-12-$(last_day_of_month "${year_name}" "12")"
dir_epoch=$(date_to_epoch "${dir_date}")
if [[ $dir_epoch > $past_epoch ]]; then
for month_dir in "${year_dir}"/*; do
month_dir=${month_dir%/}
if [[ -d $month_dir ]]; then
echo "Analyzing ${month_dir}"
month_name=$(basename "${month_dir}")
dir_date="${year_name}-${month_name}-$(last_day_of_month "${year_name}" "${month_name}")"
dir_epoch=$(date_to_epoch "${dir_date}")
if [[ $dir_epoch > $past_epoch ]]; then
for day_dir in "${month_dir}"/*; do
day_dir=${day_dir%/}
if [[ -d $day_dir ]]; then
day_name=$(basename "${day_dir}")
dir_date="${year_name}-${month_name}-${day_name}"
dir_epoch=$(date_to_epoch "${dir_date}")
if [[ $dir_epoch > $past_epoch ]]; then
for f in "${day_dir}"/*.xml; do
cp "${f}" "../data/resource/classification/xml/"
xml_count=$(($xml_count+1))
done
fi
fi
done
fi
fi
done
fi
fi
done
fi
done
echo "Copied ${xml_count} XML files into ../data/resource/classification/xml/"
else
echo "Skipped Step 1"
fi
### Step 2
if [ -z "$SKIP_STEP_2" ]; then
echo "Running classifier..."
cd "../../schema_classification" || exit 1
eval "$(conda shell.bash hook)" || exit 1
conda activate "asrael" || exit 1
python "03-apply_schema_classifiers.py" || exit 1
else
echo "Skipped Step 2"
fi
### Step 3
if [ -z "$SKIP_STEP_3" ]; then
echo "Converting classifier output to JSON"
cd "${CWD}" || exit 1
cp "../../schema_classification/data/out.pkl" "../data/resource/classification/out.pkl" || exit 1
python "pkl_classification_to_json.py" "../data/resource/classification/out.pkl" || exit 1
else
echo "Skipped Step 3"
fi
### Step 4
if [ -z "$SKIP_STEP_4" ]; then
echo "Converting classification to RDF"
cd "${CWD}/.." || exit 1
java -Dlogfile.name=asrael-classification-converter.log -Dlogfile.append=true -jar classification-converter-1.0-SNAPSHOT.jar
else
echo "Skipped Step 4"
fi
#!/usr/bin/env bash
# Cron workflow:
# - Remove files from `news-kb/data/resource/clustering/xml`
# - Remove files from `news-kb/data/resource/clustering/txt`
# - Remove files from `news-kb/data/resource/clustering/json`
# - Remove files from `news-kb/data/resource/clustering/lst`
# - Copy all XML files from the last 14 days (relative to the current date) into
#   `news-kb/data/resource/clustering/xml`
# - Execute `python src/extract_corpus_from_xml.py news-kb/data/resource/clustering/xml news-kb/data/resource/clustering/txt`
# - Execute `python src/StanfordNLPCustom/corpusParser_parallel.py news-kb/data/resource/clustering/txt news-kb/data/resource/clustering/json`
# - Execute `find "$(cd news-kb/data/resource/clustering/xml; pwd)" -name "*.xml" > news-kb/data/resource/clustering/lst/xml.lst`
# - Execute `find "$(cd news-kb/data/resource/clustering/json; pwd)" -name "*.json" > news-kb/data/resource/clustering/lst/json.lst`
# - Execute `./afp_clustering_json_parallel_grid.sh news-kb/data/resource/clustering/lst`
#   TODO: add the ability to pass the lst path (and possibly other required paths) to the .sh script
# - Parse the results from the .clust XML file and convert them into Turtle
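#
# A possible crontab entry for this script (schedule, filename, and paths are
# assumptions, not part of this commit):
#   0 6 * * * cd /path/to/news-kb/scripts && bash ./run_clustering.sh >> /var/log/asrael-clustering.log 2>&1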
date_to_epoch() {
  date_ymd="${1}"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    date -j -f "%F" "${date_ymd}" +"%s"
  else
    date -d "${date_ymd}" +%s
  fi
}
last_day_of_month() {
  date_y="${1}"
  date_m="${2}"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    # BSD date needs -v adjustments: parse the first of the month, then shift
    date -j -f "%Y/%m/%d" -v+1m -v-1d "${date_y}/${date_m}/01" +"%d"
  else
    date -d "${date_y}/${date_m}/1 + 1 month - 1 day" +%d
  fi
}
cd "$(dirname "${BASH_SOURCE[0]}")" || exit
CWD="${PWD}"
mkdir -p "../data/resource/clustering/xml"
mkdir -p "../data/resource/clustering/txt"
mkdir -p "../data/resource/clustering/json"
mkdir -p "../data/resource/clustering/lst/corpus/json"
mkdir -p "../data/resource/clustering/lst/corpus/xml"
mkdir -p "../data/resource/clustering/out"
find "../data/resource/clustering/xml" -type f -name "*.xml" -delete
find "../data/resource/clustering/txt" -type f -name "*.txt" -delete
find "../data/resource/clustering/json" -type f -name "*.json" -delete
find "../data/resource/clustering/lst/corpus/json" -type f -name "*.lst" -delete
find "../data/resource/clustering/lst/corpus/xml" -type f -name "*.lst" -delete
find "../data/resource/clustering/out" -name "*.*" -delete
### Step 1
echo "Copying XML files..."
current_date=$(date +'%Y-%m-%d')
current_epoch=$(date_to_epoch "${current_date}")
days=14
seconds=$(($days*24*60*60))
past_epoch=$(($current_epoch-seconds))
xml_count=0
for lang_dir in "../data/resource/agencefrancepresse/"*; do
  if [[ -d $lang_dir ]]; then
    lang_dir=${lang_dir%/}
    for year_dir in "${lang_dir}"/*; do
      year_dir=${year_dir%/}
      if [[ -d $year_dir ]]; then
        year_name=$(basename "${year_dir}")
        # Prune whole years: compare against the last possible day of the year
        dir_date="${year_name}-12-$(last_day_of_month "${year_name}" "12")"
        dir_epoch=$(date_to_epoch "${dir_date}")
        if (( dir_epoch > past_epoch )); then
          for month_dir in "${year_dir}"/*; do
            month_dir=${month_dir%/}
            if [[ -d $month_dir ]]; then
              echo "Analyzing ${month_dir}"
              month_name=$(basename "${month_dir}")
              dir_date="${year_name}-${month_name}-$(last_day_of_month "${year_name}" "${month_name}")"
              dir_epoch=$(date_to_epoch "${dir_date}")
              if (( dir_epoch > past_epoch )); then
                for day_dir in "${month_dir}"/*; do
                  day_dir=${day_dir%/}
                  if [[ -d $day_dir ]]; then
                    day_name=$(basename "${day_dir}")
                    dir_date="${year_name}-${month_name}-${day_name}"
                    dir_epoch=$(date_to_epoch "${dir_date}")
                    if (( dir_epoch > past_epoch )); then
                      for f in "${day_dir}"/*.xml; do
                        [[ -e $f ]] || continue  # skip if the glob matched nothing
                        cp "${f}" "../data/resource/clustering/xml/"
                        xml_count=$((xml_count + 1))
                      done
                    fi
                  fi
                done
              fi
            fi
          done
        fi
      fi
    done
  fi
done
echo "Copied ${xml_count} XML files into ../data/resource/clustering/xml/"
### Step 2
echo "Extracting corpus from XML..."
(cd "../../afp_clustering" && pipenv run python "src/extract_corpus_from_xml.py" "${CWD}/../data/resource/clustering/xml" "${CWD}/../data/resource/clustering/txt")
### Step 3
echo "Parsing corpus..."
(cd "../../afp_clustering" && pipenv run python "src/StanfordNLPCustom/corpusParser_parallel.py" "${CWD}/../data/resource/clustering/txt" "${CWD}/../data/resource/clustering/json")
### Step 4
echo "Generating listings..."
find "$(cd "${CWD}/../data/resource/clustering/xml" || exit; pwd)" -name "*.xml" > "${CWD}/../data/resource/clustering/lst/corpus/xml/corpus.lst"
find "$(cd "${CWD}/../data/resource/clustering/json" || exit; pwd)" -name "*.json" > "${CWD}/../data/resource/clustering/lst/corpus/json/corpus.lst"
### Step 5
echo "Executing afp_clustering_json_parallel_grid"
(
  cd "../../afp_clustering" && \
    LST_DIR="${CWD}/../data/resource/clustering/lst/corpus" \
    OUT_BASE="${CWD}/../data/resource/clustering/out" \
    pipenv run bash ./afp_clustering_json_parallel_grid.sh
)
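# LST_DIR and OUT_BASE are exported on the assumption that
# afp_clustering_json_parallel_grid.sh reads them; see the TODO in the header
# comments about passing these paths to the .sh script explicitly.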
#!/usr/bin/env python
# coding: utf-8
'''
Convert a schema classification pkl file into a JSON file.
'''
import sys
import os
import pickle
import json
def convert_dict_to_json(file_path):
    with open(file_path, 'rb') as fpkl, open('%s.json' % file_path, 'w') as fjson:
        data = pickle.load(fpkl)
        output = dict()
        # Invert the mapping: schema -> files becomes xml file -> list of schema ids
        for schema_name, items in data.items():
            if schema_name != 'text':
                schema_id = schema_name.split(' - ')[0].strip()
                for xml_filename, has_class in items.items():
                    if has_class == 1:
                        if xml_filename not in output:
                            output[xml_filename] = list()
                        output[xml_filename].append(schema_id)
        json.dump(output, fjson)
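
# Illustrative (hypothetical data) example of the expected pkl layout and output:
#   data   = {'E001 - Earthquakes': {'afp_001.xml': 1, 'afp_002.xml': 0}, 'text': {...}}
#   output = {'afp_001.xml': ['E001']}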
def main():
    if len(sys.argv) > 1 and os.path.isfile(sys.argv[1]):
        file_path = sys.argv[1]
        print("Processing %s ..." % file_path)
        convert_dict_to_json(file_path)
    else:
        print("Usage: %s abs_file_path" % __file__)


if __name__ == '__main__':
    main()
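
# Usage (with the path used by the classification script above):
#   python pkl_classification_to_json.py ../data/resource/classification/out.pkl
# This writes ../data/resource/classification/out.pkl.json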