NewsCollector: collect media images for AFP

- Add new datatype ImageObject
- Collect media URLs of AFP press articles using the AFP-4W API
- Add rnews:associatedMedia property to NewsArticle datatype
parent d89766aa
package fr.eurecom.asrael.newscollector.agencefrancepresse;
import fr.eurecom.asrael.newscollector.datatypes.ImageObject;
import fr.eurecom.asrael.newscollector.datatypes.NewsArticle;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
......@@ -15,7 +16,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.*;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
......@@ -57,7 +57,12 @@ public class AgenceFrancePresse {
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
try (InputStream in = new FileInputStream(xml.toFile())) {
try {
newsArticles.add(AgenceFrancePresseMethods.parseArticle(inputFactory.createXMLStreamReader(in)));
NewsArticle article = AgenceFrancePresseMethods.parseArticle(inputFactory.createXMLStreamReader(in));
List<ImageObject> images = AgenceFrancePresseMethods.getArticleImages(article);
images.forEach(imageObject -> {
article.addAssociatedMedia(imageObject);
});
newsArticles.add(article);
} catch (XMLStreamException e) {
AgenceFrancePresse.LOGGER.error(e.toString());
}
......
package fr.eurecom.asrael.newscollector.agencefrancepresse;
import fr.eurecom.asrael.newscollector.datatypes.ImageObject;
import fr.eurecom.asrael.newscollector.datatypes.NewsArticle;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List;
final class AgenceFrancePresseMethods {
private static final Logger LOGGER = LoggerFactory.getLogger(AgenceFrancePresseMethods.class);
......@@ -133,6 +150,166 @@ final class AgenceFrancePresseMethods {
return news;
}
/**
 * Queries the AFP-4W media API for the images related to an article.
 * <p>
 * The API is searched with the article's publication date, the first subject returned by
 * the article's subjects iterator (with the IPTC subject-code URI prefix stripped), its
 * dateline and its language. The {@code serial} of the first returned document is then
 * handed to {@link #downloadSerial(NewsArticle, String)}.
 *
 * @param article the article to look up; must not be {@code null}
 * @return the images found; an empty, mutable list (never {@code null}) when the article
 *         lacks a published date, subjects or dateline, when the API response contains no
 *         usable document, or when the request fails
 */
public static List<ImageObject> getArticleImages(NewsArticle article) {
    AgenceFrancePresseMethods.LOGGER.info("Querying Media API for article {}", article.getId());

    // NOTE(review): year 0, January 1st, midnight appears to be the "no date" sentinel - confirm.
    if (article.getDatePublished().isEqual(LocalDateTime.of(0, Month.JANUARY, 1, 0, 0, 0))) {
        AgenceFrancePresseMethods.LOGGER.info("getArticleImages: Article {} has no published date", article.getId());
        return new ArrayList<>();
    }
    if (article.getSubjects().isEmpty()) {
        AgenceFrancePresseMethods.LOGGER.info("getArticleImages: Article {} has no subjects", article.getId());
        return new ArrayList<>();
    }
    if (article.getDateline().isEmpty()) {
        AgenceFrancePresseMethods.LOGGER.info("getArticleImages: Article {} has no dateline", article.getId());
        return new ArrayList<>();
    }

    final String apiUrl = "http://medialab.afp.com/afp-4w";
    final String subjectPrefix = "http://cv.iptc.org/newscodes/subjectcode/";
    String subjectCode = article.getSubjects().iterator().next();
    if (subjectCode.startsWith(subjectPrefix)) {
        subjectCode = subjectCode.substring(subjectPrefix.length());
    }

    HttpClient httpClient = null;
    try {
        final URIBuilder uriBuilder = new URIBuilder(apiUrl);
        uriBuilder
                // 'yyyy' (calendar year), not 'YYYY' (week-based year, wrong around New Year).
                .setParameter("when", article.getDatePublished().format(DateTimeFormatter.ofPattern("yyyy-MM-dd")))
                .setParameter("what_iptc", subjectCode)
                .setParameter("where", article.getDateline())
                .setParameter("lang", article.getLanguage())
                .setParameter("format", "json");
        final HttpGet req = new HttpGet(uriBuilder.build());

        httpClient = HttpClientBuilder.create().build();
        final HttpResponse res = httpClient.execute(req);
        if (res.getEntity() == null) {
            return new ArrayList<>();
        }

        final Object data;
        try {
            data = JSONValue.parse(new InputStreamReader(res.getEntity().getContent(), "UTF-8"));
        } finally {
            // Always release the connection, even when reading the body fails.
            EntityUtils.consume(res.getEntity());
        }

        final String serial = firstSerialOf(data);
        if (serial == null) {
            return new ArrayList<>();
        }
        // Get the list of images for this article
        return downloadSerial(article, serial);
    } catch (URISyntaxException | IOException e) {
        AgenceFrancePresseMethods.LOGGER.error(e.toString(), e);
        return new ArrayList<>();
    } finally {
        // The HttpClient interface is not Closeable, but the instance built above is.
        if (httpClient instanceof Closeable) {
            try {
                ((Closeable) httpClient).close();
            } catch (IOException e) {
                AgenceFrancePresseMethods.LOGGER.warn("Could not close HTTP client", e);
            }
        }
    }
}

/**
 * Extracts {@code response.docs[0].serial} from a parsed AFP-4W JSON answer.
 *
 * @param data the parsed JSON value, possibly {@code null}
 * @return the serial as a non-empty string, or {@code null} when any level of the
 *         expected structure is missing or has an unexpected type
 */
private static String firstSerialOf(final Object data) {
    if (!(data instanceof JSONObject)) {
        return null;
    }
    final Object response = ((JSONObject) data).get("response");
    if (!(response instanceof JSONObject)) {
        return null;
    }
    final Object docs = ((JSONObject) response).get("docs");
    if (!(docs instanceof JSONArray) || ((JSONArray) docs).isEmpty()) {
        return null;
    }
    final Object doc = ((JSONArray) docs).get(0);
    if (!(doc instanceof JSONObject)) {
        return null;
    }
    final Object serial = ((JSONObject) doc).get("serial");
    if (serial == null) {
        // String.valueOf(null) would yield the literal text "null"; treat it as absent instead.
        return null;
    }
    final String serialText = String.valueOf(serial);
    return serialText.isEmpty() ? null : serialText;
}
/**
 * Downloads the XML document identified by {@code serial} from the AFP media server and
 * collects the images it describes.
 * <p>
 * An image is created for every {@code ContentItem} element that carries a non-empty
 * {@code Href} attribute while the current {@code Role} is {@code HighDef}; it is kept when
 * the item's {@code MediaType} is {@code Photo}. The most recent {@code HeadLine} text is
 * used as the image description, and {@code Property} elements named {@code Width} and
 * {@code Height} provide the dimensions.
 * <p>
 * Parsing is best-effort: on any I/O, XML or runtime failure the error is logged and the
 * images collected so far are returned.
 *
 * @param article the article the images belong to; its language tags the descriptions
 * @param serial  the path of the serial, appended to {@code http://medialab.afp.com}
 * @return the images found, possibly empty, never {@code null}
 */
public static List<ImageObject> downloadSerial(final NewsArticle article, final String serial) {
    AgenceFrancePresseMethods.LOGGER.info("Downloading serial {}", serial);
    final String url = "http://medialab.afp.com" + serial;
    final List<ImageObject> images = new ArrayList<>();
    final XMLInputFactory inputFactory = XMLInputFactory.newInstance();
    // The document is fetched over the network: do not resolve external entities (XXE).
    inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

    try (InputStream in = new URL(url).openStream()) {
        final XMLStreamReader streamReader = inputFactory.createXMLStreamReader(in);
        try {
            boolean inNewsComponent = false;
            boolean inContentItem = false;
            String mediaType = "";
            String caption = "";
            String role = "";
            ImageObject tmpImage = null;

            while (streamReader.hasNext()) {
                if (streamReader.isStartElement()) {
                    switch (streamReader.getLocalName()) {
                        case "MediaType":
                            if (inContentItem) {
                                mediaType = streamReader.getAttributeValue(null, "FormalName");
                            }
                            break;
                        case "Property":
                            if (inContentItem && tmpImage != null) {
                                // Either attribute may be absent; compare constant-first to stay null-safe.
                                final String property = streamReader.getAttributeValue(null, "FormalName");
                                final String value = streamReader.getAttributeValue(null, "Value");
                                if ("Width".equals(property)) {
                                    final Integer width = parseDimension(value);
                                    if (width != null) {
                                        tmpImage.setWidth(width);
                                    }
                                } else if ("Height".equals(property)) {
                                    final Integer height = parseDimension(value);
                                    if (height != null) {
                                        tmpImage.setHeight(height);
                                    }
                                }
                            }
                            break;
                        case "Role":
                            if (inNewsComponent) {
                                role = streamReader.getAttributeValue(null, "FormalName");
                            }
                            break;
                        case "HeadLine":
                            if (inNewsComponent) {
                                // getElementText() leaves the cursor on the HeadLine end tag.
                                caption = streamReader.getElementText();
                            }
                            break;
                        case "NewsComponent":
                            inNewsComponent = true;
                            break;
                        case "ContentItem":
                            if (inNewsComponent && "HighDef".equals(role)) {
                                final String href = streamReader.getAttributeValue(null, "Href");
                                if (href != null && !href.isEmpty()) {
                                    tmpImage = new ImageObject();
                                    tmpImage.addDescription(article.getLanguage(), caption);
                                    tmpImage.setUrl("http://medialab.afp.com" + href);
                                    inContentItem = true;
                                }
                            }
                            break;
                        default:
                            break;
                    }
                }
                if (streamReader.isEndElement()) {
                    switch (streamReader.getLocalName()) {
                        case "NewsComponent":
                            role = "";
                            caption = "";
                            inNewsComponent = false;
                            break;
                        case "ContentItem":
                            if (tmpImage != null
                                    && "Photo".equals(mediaType)
                                    && "HighDef".equals(role)
                                    && !tmpImage.getUrl().isEmpty()) {
                                images.add(tmpImage);
                            }
                            // Drop per-item state so nothing leaks into the next ContentItem.
                            tmpImage = null;
                            mediaType = "";
                            inContentItem = false;
                            break;
                        default:
                            break;
                    }
                }
                streamReader.next();
            }
        } finally {
            streamReader.close();
        }
    } catch (IOException | XMLStreamException | RuntimeException e) {
        // Best-effort: keep whatever was collected before the failure.
        AgenceFrancePresseMethods.LOGGER.error("Failed to download serial " + serial, e);
    }
    return images;
}

/**
 * Parses an image dimension attribute.
 *
 * @param raw the attribute value, possibly {@code null}
 * @return the parsed integer, or {@code null} when the value is missing or not a number
 */
private static Integer parseDimension(final String raw) {
    if (raw == null) {
        return null;
    }
    try {
        return Integer.valueOf(raw.trim());
    } catch (NumberFormatException e) {
        AgenceFrancePresseMethods.LOGGER.warn("Ignoring non-numeric image dimension '{}'", raw);
        return null;
    }
}
public static LocalDateTime parseDate(final String dateStr) {
// Formats known:
// 20110725T005637Z
......@@ -150,4 +327,5 @@ final class AgenceFrancePresseMethods {
}
return dt;
}
}
package fr.eurecom.asrael.newscollector.datatypes;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.ResourceFactory;
import org.apache.jena.vocabulary.RDF;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
/**
 * An image attached to a news article, modelled after the rNews {@code ImageObject} type.
 * <p>
 * String properties are never {@code null}: they default to the empty string, and
 * {@code null} passed to a setter is stored as the empty string. Descriptions are kept
 * per language tag.
 */
public class ImageObject {
    /** Namespace of the rNews vocabulary used for every property emitted by this class. */
    private static final String RNEWS_NS = "http://iptc.org/std/rNews/2011-10-07#";

    private String url;
    private int width;
    private int height;
    private String copyrightHolder;
    private String sourceOrganization;
    private final Map<String, String> descriptions;

    /**
     * Default constructor: empty strings, zero dimensions and no description.
     */
    public ImageObject() {
        this.url = "";
        this.width = 0;
        this.height = 0;
        this.copyrightHolder = "";
        this.sourceOrganization = "";
        this.descriptions = new HashMap<>();
    }

    /** Maps {@code null} to the empty string so the fields stay non-null. */
    private static String nullToEmpty(final String value) {
        return value == null ? "" : value;
    }

    /** @return the image URL, empty when unset */
    public final String getUrl() {
        return url;
    }

    /** @param url the image URL; {@code null} is stored as the empty string */
    public final void setUrl(final String url) {
        this.url = nullToEmpty(url);
    }

    /** @return the width, {@code 0} when unknown */
    public final int getWidth() {
        return width;
    }

    /** @param width the image width */
    public final void setWidth(final int width) {
        this.width = width;
    }

    /** @return the height, {@code 0} when unknown */
    public final int getHeight() {
        return height;
    }

    /** @param height the image height */
    public final void setHeight(final int height) {
        this.height = height;
    }

    /** @return the copyright holder, empty when unset */
    public final String getCopyrightHolder() {
        return copyrightHolder;
    }

    /** @param copyrightHolder the copyright holder; {@code null} is stored as the empty string */
    public final void setCopyrightHolder(final String copyrightHolder) {
        this.copyrightHolder = nullToEmpty(copyrightHolder);
    }

    /** @return the source organization, empty when unset */
    public final String getSourceOrganization() {
        return sourceOrganization;
    }

    /** @param sourceOrganization the source organization; {@code null} is stored as the empty string */
    public final void setSourceOrganization(final String sourceOrganization) {
        this.sourceOrganization = nullToEmpty(sourceOrganization);
    }

    /** @return a read-only view of the descriptions, keyed by language tag */
    public Map<String, String> getDescriptions() {
        return Collections.unmodifiableMap(descriptions);
    }

    /**
     * Replaces all descriptions.
     *
     * @param descriptions the new descriptions keyed by language tag; {@code null} clears them
     */
    public final void setDescription(final Map<String, String> descriptions) {
        this.descriptions.clear();
        if (descriptions != null) {
            this.descriptions.putAll(descriptions);
        }
    }

    /**
     * Adds a description for a language. When the language already has a description, the
     * new text is appended to it, separated by a single space.
     *
     * @param lang        the language tag
     * @param description the text to add (trimmed before use); {@code null} is ignored
     */
    public void addDescription(String lang, String description) {
        if (description == null) {
            return;
        }
        // Append the new description text itself, not the language tag.
        this.descriptions.merge(lang, description.trim(), (existing, added) -> existing + ' ' + added);
    }

    /**
     * Create the RDF model of the current image.
     *
     * @return A RDF model describing this image; it only carries the namespace prefixes
     *         when the URL is empty
     */
    public final Model getRDFModel() {
        final Model model = ModelFactory.createDefaultModel();
        final Map<String, String> prefixes = new HashMap<>();
        prefixes.put("rdf", RDF.getURI());
        prefixes.put("dc", "http://purl.org/dc/elements/1.1/");
        prefixes.put("schema", "http://schema.org/");
        prefixes.put("xsd", "http://www.w3.org/2001/XMLSchema#");
        prefixes.put("rnews", RNEWS_NS);
        model.setNsPrefixes(prefixes);

        // Without a URL there is no subject to attach statements to.
        if (this.url.isEmpty()) {
            return model;
        }

        model.add(ResourceFactory.createResource(this.url), RDF.type,
                ResourceFactory.createResource(RNEWS_NS + "ImageObject"));

        // Width
        if (this.width > 0) {
            addPlainLiteral(model, "width", String.valueOf(this.width));
        }
        // Height
        if (this.height > 0) {
            addPlainLiteral(model, "height", String.valueOf(this.height));
        }
        // Copyright holder
        if (!this.copyrightHolder.isEmpty()) {
            addPlainLiteral(model, "copyrightHolder", this.copyrightHolder);
        }
        // Source organization
        if (!this.sourceOrganization.isEmpty()) {
            addPlainLiteral(model, "sourceOrganization", this.sourceOrganization);
        }
        // Description
        for (final Map.Entry<String, String> entry : this.descriptions.entrySet()) {
            model.add(ResourceFactory.createResource(this.url),
                    ResourceFactory.createProperty(RNEWS_NS + "description"),
                    ResourceFactory.createLangLiteral(entry.getValue(), entry.getKey()));
        }
        return model;
    }

    /** Adds {@code <url> rnews:<localName> "value"} to the model as a plain literal. */
    private void addPlainLiteral(final Model model, final String localName, final String value) {
        model.add(ResourceFactory.createResource(this.url),
                ResourceFactory.createProperty(RNEWS_NS + localName),
                ResourceFactory.createPlainLiteral(value));
    }

    @Override
    public final boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if ((obj == null) || (this.getClass() != obj.getClass())) {
            return false;
        }
        final ImageObject image = (ImageObject) obj;
        return this.width == image.width
                && this.height == image.height
                && this.url.equals(image.url)
                && this.copyrightHolder.equals(image.copyrightHolder)
                && this.sourceOrganization.equals(image.sourceOrganization)
                && this.descriptions.equals(image.descriptions);
    }

    @Override
    public final int hashCode() {
        int result = this.url.hashCode();
        result = 31 * (result + this.width);
        result = 31 * (result + this.height);
        result = 31 * (result + this.copyrightHolder.hashCode());
        result = 31 * (result + this.sourceOrganization.hashCode());
        result = 31 * (result + this.descriptions.hashCode());
        return result;
    }

    @Override
    public final String toString() {
        return "ImageObject{" +
                "url='" + this.url + '\'' +
                ", width='" + this.width + '\'' +
                ", height='" + this.height + '\'' +
                ", copyrightHolder='" + this.copyrightHolder + '\'' +
                ", sourceOrganization='" + this.sourceOrganization + '\'' +
                ", description='" + this.descriptions + '\'' +
                '}';
    }
}
......@@ -33,6 +33,7 @@ public class NewsArticle {
private String country;
private String city;
private Set<String> keywords;
private Set<ImageObject> associatedMedias;
private static final String BASE_URI = "http://asrael.eurecom.fr/news/";
/**
......@@ -53,6 +54,7 @@ public class NewsArticle {
this.country = "";
this.city = "";
this.keywords = new HashSet<>();
this.associatedMedias = new HashSet<>();
}
public final String getId() {
......@@ -195,6 +197,14 @@ public class NewsArticle {
this.keywords.add(keyword);
}
/**
 * Gives access to the images associated with this article.
 *
 * @return the set held by this article itself (not a copy)
 */
public Set<ImageObject> getAssociatedMedias() {
    final Set<ImageObject> medias = this.associatedMedias;
    return medias;
}
/**
 * Associates an image with this article.
 *
 * @param media the image to add to the article's set of associated medias
 */
public void addAssociatedMedia(ImageObject media) {
    final Set<ImageObject> medias = this.associatedMedias;
    medias.add(media);
}
/**
* Create the RDF model of the current news.
......@@ -300,6 +310,17 @@ public class NewsArticle {
ResourceFactory.createPlainLiteral(keyword));
}
// Associated medias
for (final ImageObject media : this.associatedMedias) {
if (!media.getUrl().isEmpty()) {
model.add(media.getRDFModel());
model.add(ResourceFactory.createResource(NewsArticle.BASE_URI +
UUIDGenerator.generateUUID(this.id)),
ResourceFactory.createProperty("http://iptc.org/std/rNews/2011-10-07#associatedMedia"),
ResourceFactory.createResource(media.getUrl()));
}
}
// Genre
if (!this.genre.isEmpty()) {
model.add(ResourceFactory.createResource(NewsArticle.BASE_URI +
......@@ -385,7 +406,10 @@ public class NewsArticle {
if (!this.language.equals(news.language)) {
return false;
}
if (!this.language.equals(news.keywords)) {
if (!this.keywords.equals(news.keywords)) {
return false;
}
if (!this.associatedMedias.equals(news.associatedMedias)) {
return false;
}
......@@ -408,6 +432,7 @@ public class NewsArticle {
result = 31 * (result + this.genre.hashCode());
result = 31 * (result + this.language.hashCode());
result = 31 * (result + this.keywords.hashCode());
result = 31 * (result + this.associatedMedias.hashCode());
return result;
}
......@@ -427,6 +452,7 @@ public class NewsArticle {
", genre='" + this.genre + '\'' +
", language='" + this.language + '\'' +
", keywords='" + this.keywords + '\'' +
", keywords='" + this.associatedMedias + '\'' +
'}';
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment