-
Notifications
You must be signed in to change notification settings - Fork 705
Extract image URL #975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Extract image URL #975
Changes from 4 commits
47f4b3a
65ad660
5936d88
27a90dc
355d694
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -1748,6 +1748,9 @@ Readability.prototype = { | |||
| if (typeof parsed.datePublished === "string") { | ||||
| metadata.datePublished = parsed.datePublished.trim(); | ||||
| } | ||||
| if (typeof parsed.dateModified === "string") { | ||||
| metadata.dateModified = parsed.dateModified.trim(); | ||||
| } | ||||
| } catch (err) { | ||||
| this.log(err.message); | ||||
| } | ||||
|
|
@@ -1771,11 +1774,11 @@ Readability.prototype = { | |||
|
|
||||
| // property is a space-separated list of values | ||||
| var propertyPattern = | ||||
| /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi; | ||||
| /\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|modified|title|site_name)\s*/gi; | ||||
|
|
||||
| // name is a single value | ||||
| var namePattern = | ||||
| /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; | ||||
| /^\s*(?:(dc|dcterms|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; | ||||
|
|
||||
| // Find description tags. | ||||
| this._forEachNode(metaElements, function (element) { | ||||
|
|
@@ -1813,7 +1816,7 @@ Readability.prototype = { | |||
| metadata.title = | ||||
| jsonld.title || | ||||
| values["dc:title"] || | ||||
| values["dcterm:title"] || | ||||
| values["dcterms:title"] || | ||||
| values["og:title"] || | ||||
| values["weibo:article:title"] || | ||||
| values["weibo:webpage:title"] || | ||||
|
|
@@ -1835,7 +1838,7 @@ Readability.prototype = { | |||
| metadata.byline = | ||||
| jsonld.byline || | ||||
| values["dc:creator"] || | ||||
| values["dcterm:creator"] || | ||||
| values["dcterms:creator"] || | ||||
| values.author || | ||||
| values["parsely-author"] || | ||||
| articleAuthor; | ||||
|
|
@@ -1844,7 +1847,7 @@ Readability.prototype = { | |||
| metadata.excerpt = | ||||
| jsonld.excerpt || | ||||
| values["dc:description"] || | ||||
| values["dcterm:description"] || | ||||
| values["dcterms:description"] || | ||||
| values["og:description"] || | ||||
| values["weibo:article:description"] || | ||||
| values["weibo:webpage:description"] || | ||||
|
|
@@ -1854,24 +1857,95 @@ Readability.prototype = { | |||
| // get site name | ||||
| metadata.siteName = jsonld.siteName || values["og:site_name"]; | ||||
|
|
||||
| // get image thumbnail | ||||
| metadata.image = values["og:image"] || values.image || values["twitter:image"]; | ||||
|
|
||||
| // get favicon | ||||
| metadata.favicon = this._getArticleFavicon() | ||||
|
|
||||
| // get article published time | ||||
| metadata.publishedTime = | ||||
| jsonld.datePublished || | ||||
| values["article:published_time"] || | ||||
| values["parsely-pub-date"] || | ||||
| null; | ||||
|
|
||||
| // get modified date | ||||
| metadata.modifiedTime = | ||||
| jsonld.dateModified || | ||||
| values["article:modified_time"] || | ||||
| values["dcterms:modified"] || | ||||
| null; | ||||
|
|
||||
| // in many sites the meta value is escaped with HTML entities, | ||||
| // so here we need to unescape it | ||||
| metadata.title = this._unescapeHtmlEntities(metadata.title); | ||||
| metadata.byline = this._unescapeHtmlEntities(metadata.byline); | ||||
| metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); | ||||
| metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); | ||||
| metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); | ||||
| metadata.modifiedTime = this._unescapeHtmlEntities(metadata.modifiedTime); | ||||
|
|
||||
| return metadata; | ||||
| }, | ||||
|
|
||||
| /** | ||||
| * Trying to extract the favicon from the page | ||||
| **/ | ||||
| _getArticleFavicon() { | ||||
|
|
||||
| // string to return | ||||
| var favicon = ""; | ||||
|
|
||||
| // find all link tags | ||||
| var metaElements = this._doc.getElementsByTagName("link"); | ||||
|
|
||||
| // iterate over tags. | ||||
| this._forEachNode(metaElements, function (element) { | ||||
|
|
||||
| // make sure the type is correct and element contains a href attribute | ||||
| var rel = element.hasAttribute("rel") ? element.getAttribute("rel") : ""; | ||||
| if (rel === "icon" && element.hasAttribute("href")) { | ||||
| favicon = element.getAttribute("href"); | ||||
|
|
||||
| var type = element.hasAttribute("type") ? element.getAttribute("type") : ""; | ||||
| if(type === "image/svg+xml") | ||||
| { | ||||
| // svg wins as best quality format | ||||
| return this._toAbsoluteURI(favicon); | ||||
|
Comment on lines
+1895
to
+1896
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This returns from the inner function but not the outer one, so I don't think this works, unfortunately?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is still broken, unfortunately? The code wants to use the Line 349 in 1f0ec42
To make this work, switch from using |
||||
| } | ||||
|
|
||||
| // what is still missing here is an algorithm that compares all hrefs and selects the "best" size | ||||
| } | ||||
| }); | ||||
|
|
||||
| // make sure to return an absolute URI | ||||
| return this._toAbsoluteURI(favicon); | ||||
| }, | ||||
|
|
||||
| /** | ||||
| * Convert a relative URI to an absolute URI | ||||
| * | ||||
| * @param {string} uri | ||||
| **/ | ||||
| _toAbsoluteURI(uri) { | ||||
|
|
||||
| // stop processing if uri is empty | ||||
| if(uri === ""){ | ||||
| return uri; | ||||
| } | ||||
|
|
||||
| // try to parse into URL object | ||||
| var absolute_uri = URL.parse(uri, this._doc.baseURI); | ||||
LordEidi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| if(!absolute_uri){ | ||||
| // parsing failed, return original URI | ||||
| return uri; | ||||
| } | ||||
|
|
||||
| // parsing worked, return absolute URI | ||||
| return absolute_uri.href; | ||||
| }, | ||||
|
|
||||
| /** | ||||
| * Check if node is image, or if node contains exactly only one image | ||||
| * whether as a direct child or as its descendants. | ||||
|
|
@@ -2784,7 +2858,10 @@ Readability.prototype = { | |||
| length: textContent.length, | ||||
| excerpt: metadata.excerpt, | ||||
| siteName: metadata.siteName || this._articleSiteName, | ||||
| image: metadata.image, | ||||
| favicon: metadata.favicon, | ||||
| publishedTime: metadata.publishedTime, | ||||
| modifiedTime: metadata.modifiedTime, | ||||
| }; | ||||
| }, | ||||
| }; | ||||
|
|
||||
Uh oh!
There was an error while loading. Please reload this page.