Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ Returns an object containing the following properties:
* `siteName`: name of the site;
* `lang`: content language;
* `publishedTime`: published time;
* `modifiedTime`: modified time;
* `favicon`: site favicon as URI, SVG format if available;
* `image`: URI to article image, extracted from metadata;

The `parse()` method works by modifying the DOM. This removes some elements from the web page, which may be undesirable. You can avoid this by passing a clone of the `document` object to the `Readability` constructor:

Expand Down
87 changes: 82 additions & 5 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -1748,6 +1748,9 @@ Readability.prototype = {
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}
if (typeof parsed.dateModified === "string") {
metadata.dateModified = parsed.dateModified.trim();
}
} catch (err) {
this.log(err.message);
}
Expand All @@ -1771,11 +1774,11 @@ Readability.prototype = {

// property is a space-separated list of values
var propertyPattern =
/\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
/\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|modified|title|site_name)\s*/gi;

// name is a single value
var namePattern =
/^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
/^\s*(?:(dc|dcterms|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;

// Find description tags.
this._forEachNode(metaElements, function (element) {
Expand Down Expand Up @@ -1813,7 +1816,7 @@ Readability.prototype = {
metadata.title =
jsonld.title ||
values["dc:title"] ||
values["dcterm:title"] ||
values["dcterms:title"] ||
values["og:title"] ||
values["weibo:article:title"] ||
values["weibo:webpage:title"] ||
Expand All @@ -1835,7 +1838,7 @@ Readability.prototype = {
metadata.byline =
jsonld.byline ||
values["dc:creator"] ||
values["dcterm:creator"] ||
values["dcterms:creator"] ||
values.author ||
values["parsely-author"] ||
articleAuthor;
Expand All @@ -1844,7 +1847,7 @@ Readability.prototype = {
metadata.excerpt =
jsonld.excerpt ||
values["dc:description"] ||
values["dcterm:description"] ||
values["dcterms:description"] ||
values["og:description"] ||
values["weibo:article:description"] ||
values["weibo:webpage:description"] ||
Expand All @@ -1854,24 +1857,95 @@ Readability.prototype = {
// get site name
metadata.siteName = jsonld.siteName || values["og:site_name"];

// get image thumbnail
metadata.image = values["og:image"] || values.image || values["twitter:image"];

// get favicon
metadata.favicon = this._getArticleFavicon()

// get article published time
metadata.publishedTime =
jsonld.datePublished ||
values["article:published_time"] ||
values["parsely-pub-date"] ||
null;

// get modified date
metadata.modifiedTime =
jsonld.dateModified ||
values["article:modified_time"] ||
values["dcterms:modified"] ||
null;

// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
metadata.title = this._unescapeHtmlEntities(metadata.title);
metadata.byline = this._unescapeHtmlEntities(metadata.byline);
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
metadata.modifiedTime = this._unescapeHtmlEntities(metadata.modifiedTime);

return metadata;
},

/**
* Trying to extract the favicon from the page
**/
_getArticleFavicon() {

// string to return
var favicon = "";

// find all link tags
var metaElements = this._doc.getElementsByTagName("link");

// iterate over tags.
this._forEachNode(metaElements, function (element) {

// make sure the type is correct and element contains a href attribute
var rel = element.hasAttribute("rel") ? element.getAttribute("rel") : "";
if (rel === "icon" && element.hasAttribute("href")) {
favicon = element.getAttribute("href");

var type = element.hasAttribute("type") ? element.getAttribute("type") : "";
if(type === "image/svg+xml")
{
// svg wins as best quality format
return this._toAbsoluteURI(favicon);
Comment on lines +1895 to +1896
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This returns from the inner function but not the outer one, so I don't think this works, unfortunately?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is still broken, unfortunately? The code wants to use the return this._toAbsoluteURI(favicon) as the return value for _getArticleFavicon. But this is inside the (function) callback provided to this._forEachNode, and the return value is not used by _forEachNode, see

Array.prototype.forEach.call(nodeList, fn, this);
. So it's just going to be dropped on the floor.

To make this work, switch from using _forEachNode to using a manual loop, for (var i = 0; i < metaElements.length; i++) etc. - this is fine in this loop as we're not modifying the DOM while we're looping over these elements.

}

// what is missing here is an algorithm which compares all href and selects the "best" size
}
});

// make sure to return an absolute URI
return this._toAbsoluteURI(favicon);
},

/**
* Convert a relative to an absolute URI
*
* @param {string} uri
**/
_toAbsoluteURI(uri) {

// stop processing if uri is empty
if(uri === ""){
return uri;
}

// try to parse into URL object
var absolute_uri = URL.parse(uri, this._doc.baseURI);
if(!absolute_uri){
// parsing failed, return original URI
return uri;
}

// parsing worked, return absolute URI
return absolute_uri.href;
},

/**
* Check if node is image, or if node contains exactly only one image
* whether as a direct child or as its descendants.
Expand Down Expand Up @@ -2784,7 +2858,10 @@ Readability.prototype = {
length: textContent.length,
excerpt: metadata.excerpt,
siteName: metadata.siteName || this._articleSiteName,
image: metadata.image,
favicon: metadata.favicon,
publishedTime: metadata.publishedTime,
modifiedTime: metadata.modifiedTime,
};
},
};
Expand Down