release: 2.1.0 (#373)

adampash · web-flow · commit ca47f9c7a7ac · 2019-04-10T08:42:10.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,52 @@
 # Mercury Parser Changelog
 
+### 2.1.0 (Apr 10, 2019)
+
+##### Commits
+
+- [[`3614e31abc`](https://github.com/postlight/mercury-parser/commit/3614e31abc)] - **fix**: skip absolutizing empty hrefs (#372) (Toufic Mouallem)
+- [[`73be0c5a10`](https://github.com/postlight/mercury-parser/commit/73be0c5a10)] - **feat**: add www.jnsa.org custom parser (#346) (kik0220)
+- [[`eacd1ee97f`](https://github.com/postlight/mercury-parser/commit/eacd1ee97f)] - **feat**: custom genius parser. (#284) (Adam Pash)
+- [[`c389c966d7`](https://github.com/postlight/mercury-parser/commit/c389c966d7)] - **feat**: add jvndb.jvn.jp custom parser (#345) (kik0220)
+- [[`8493d05cb5`](https://github.com/postlight/mercury-parser/commit/8493d05cb5)] - **feat**: add scan.netsecurity.ne.jp custom parser (#347) (kik0220)
+- [[`2a76c6c212`](https://github.com/postlight/mercury-parser/commit/2a76c6c212)] - **feat**: add www.elecom.co.jp custom parser (#348) (kik0220)
+- [[`a9e010b718`](https://github.com/postlight/mercury-parser/commit/a9e010b718)] - **feat**: add www.sanwa.co.jp custom parser (#349) (kik0220)
+- [[`1639eae324`](https://github.com/postlight/mercury-parser/commit/1639eae324)] - **feat**: add www.asahi.com custom parser (#350) (kik0220)
+- [[`21f7de70c1`](https://github.com/postlight/mercury-parser/commit/21f7de70c1)] - **feat**: add buzzap.jp custom parser (#351) (kik0220)
+- [[`f3a7e393a3`](https://github.com/postlight/mercury-parser/commit/f3a7e393a3)] - **feat**: add www.ossnews.jp custom parser (#352) (kik0220)
+- [[`c309bdb373`](https://github.com/postlight/mercury-parser/commit/c309bdb373)] - **feat**: add otrs.com custom parser (#353) (kik0220)
+- [[`71c4d05037`](https://github.com/postlight/mercury-parser/commit/71c4d05037)] - **chore**: Include "src/shims" for webpack builds for web (#302) (Alexsander Akers)
+- [[`a3fe02678c`](https://github.com/postlight/mercury-parser/commit/a3fe02678c)] - **chore**: small CoC typofix (#358) (Frankie Simms)
+- [[`437f50a5c8`](https://github.com/postlight/mercury-parser/commit/437f50a5c8)] - **fix**: Initialize Content-Type as empty string if not present (#359) (John Holdun)
+- [[`da9a836eab`](https://github.com/postlight/mercury-parser/commit/da9a836eab)] - **chore**: remove unneeded import (#357) (Frankie Simms)
+- [[`bafa764000`](https://github.com/postlight/mercury-parser/commit/bafa764000)] - **chore**: set up ciftr for failed test reports (#343) (Frankie Simms)
+- [[`262dda94b3`](https://github.com/postlight/mercury-parser/commit/262dda94b3)] - **fix**: explicitly reject non-200 status codes (#342) (Toufic Mouallem)
+- [[`b6c82f2b16`](https://github.com/postlight/mercury-parser/commit/b6c82f2b16)] - **docs**: fix extend typo in README (#340) (Drew Bell)
+- [[`144a797564`](https://github.com/postlight/mercury-parser/commit/144a797564)] - **feat**: Support passing custom headers in requests (#337) (Toufic Mouallem)
+- [[`3ed778b53e`](https://github.com/postlight/mercury-parser/commit/3ed778b53e)] - **fix**: Adapt CNBC extractor to article redesign (#336) (Toufic Mouallem)
+- [[`da9606a4cb`](https://github.com/postlight/mercury-parser/commit/da9606a4cb)] - **docs**: Add parsing custom HTML to README.md (#326) (Toufic Mouallem)
+- [[`b3e2a0ffd1`](https://github.com/postlight/mercury-parser/commit/b3e2a0ffd1)] - **feat**: extract custom types with extend option (#313) (Drew Bell)
+- [[`136d6df798`](https://github.com/postlight/mercury-parser/commit/136d6df798)] - **feat**: Return specific errors on failed parse attempts (Toufic Mouallem)
+- [[`a250f403f5`](https://github.com/postlight/mercury-parser/commit/a250f403f5)] - **fix**: Preserve whitespace in certain HTML elements (#333) (Toufic Mouallem)
+- [[`2a3ade706d`](https://github.com/postlight/mercury-parser/commit/2a3ade706d)] - **fix**: run parser preview (Adam Pash)
+- [[`a7e4c67d1d`](https://github.com/postlight/mercury-parser/commit/a7e4c67d1d)] - **feat**: Extract content from GitHub repos. (#306) (Ben Ubois)
+- [[`6e66887048`](https://github.com/postlight/mercury-parser/commit/6e66887048)] - **docs**: add content formats to README.md (#318) (Matthew Watkins)
+- [[`0940971069`](https://github.com/postlight/mercury-parser/commit/0940971069)] - **fix**: better handling for responsive images (#312) (Toufic Mouallem)
+- [[`785a22245f`](https://github.com/postlight/mercury-parser/commit/785a22245f)] - **feat**: switch from forked request to postman-request (#319) (Drew Bell)
+- [[`7844129fda`](https://github.com/postlight/mercury-parser/commit/7844129fda)] - **feat**: Add custom parser for Reddit (#307) (Toufic Mouallem)
+- [[`13581cd899`](https://github.com/postlight/mercury-parser/commit/13581cd899)] - **feat**: upgrade watchify to remove vulnerable hoek dep (#320) (Drew Bell)
+- [[`91fb0dfb46`](https://github.com/postlight/mercury-parser/commit/91fb0dfb46)] - **fix**: update parse signature in tests (#315) (Drew Bell)
+- [[`ffb25f34d7`](https://github.com/postlight/mercury-parser/commit/ffb25f34d7)] - **docs**: add usage gif (#308) (Adam Pash)
+- [[`9714cb70c5`](https://github.com/postlight/mercury-parser/commit/9714cb70c5)] - **feat**: Use Deadspin parser for all Kinja websites (#304) (Toufic Mouallem)
+- [[`83d1c2401b`](https://github.com/postlight/mercury-parser/commit/83d1c2401b)] - **feat**: add custom extractor for blisterreview.com (#299) (Jordan Hotmann)
+- [[`d9a1e7b22b`](https://github.com/postlight/mercury-parser/commit/d9a1e7b22b)] - **feat**: add news.mynavi.jp custom parser (#287) (kik0220)
+- [[`44a7ec791d`](https://github.com/postlight/mercury-parser/commit/44a7ec791d)] - **docs**: typofix (#300) (Olli Sulopuisto)
+- [[`0a15a37f04`](https://github.com/postlight/mercury-parser/commit/0a15a37f04)] - **fix**: ci artifact paths (#301) (Adam Pash)
+- [[`9698d9a0c4`](https://github.com/postlight/mercury-parser/commit/9698d9a0c4)] - **dx**: comment on custom parser pr fix (#278) (Adam Pash)
+- [[`ed14203e97`](https://github.com/postlight/mercury-parser/commit/ed14203e97)] - **fix**: return early if creating the resource failed. (#285) (Ben Ubois)
+- [[`52dfdda553`](https://github.com/postlight/mercury-parser/commit/52dfdda553)] - **deps**: Update mocha to the latest version 🚀 (#282) (greenkeeper[bot])
+- [[`b044cfa958`](https://github.com/postlight/mercury-parser/commit/b044cfa958)] - **release**: 2.0.0 (#275) (Adam Pash)
+
 ### 2.0.0 (Feb 13, 2019)
 
 ##### Commits
diff --git a/dist/mercury.js b/dist/mercury.js
@@ -220,13 +220,13 @@ function get(options) {
     });
   });
 } // Evaluate a response to ensure it's something we should be keeping.
-// This does not validate in the sense of a response being 200 level or
-// not. Validation here means that we haven't found reason to bail from
+// This does not validate in the sense of a response being 200 or not.
+// Validation here means that we haven't found reason to bail from
 // further processing of this url.
 
 
 function validateResponse(response) {
-  var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
+  var parseNon200 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
 
   // Check if we got a valid status code
   // This isn't great, but I'm requiring a statusMessage to be set
@@ -237,8 +237,8 @@ function validateResponse(response) {
   if (response.statusMessage && response.statusMessage !== 'OK' || response.statusCode !== 200) {
     if (!response.statusCode) {
       throw new Error("Unable to fetch content. Original exception was ".concat(response.error));
-    } else if (!parseNon2xx) {
-      throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-2xx level status codes."));
+    } else if (!parseNon200) {
+      throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-200 status codes."));
     }
   }
 
@@ -1248,6 +1248,7 @@ function absolutize($, rootUrl, attr) {
   $("[".concat(attr, "]")).each(function (_, node) {
     var attrs = getAttrs(node);
     var url = attrs[attr];
+    if (!url) return;
     var absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
     setAttr(node, attr, absoluteUrl);
   });
@@ -1646,7 +1647,8 @@ var Resource = {
   generateDoc: function generateDoc(_ref) {
     var content = _ref.body,
         response = _ref.response;
-    var contentType = response.headers['content-type']; // TODO: Implement is_text function from
+    var _response$headers$con = response.headers['content-type'],
+        contentType = _response$headers$con === void 0 ? '' : _response$headers$con; // TODO: Implement is_text function from
     // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
 
     if (!contentType.includes('html') && !contentType.includes('text')) {
@@ -4832,6 +4834,236 @@ var WwwRedditComExtractor = {
   }
 };
 
+var OtrsComExtractor = { // Custom extractor definition for otrs.com; registered in CustomExtractors under this same name.
+  domain: 'otrs.com',
+  title: {
+    selectors: ['#main article h1']
+  },
+  author: {
+    selectors: ['div.dateplusauthor a']
+  },
+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  dek: {
+    selectors: [['meta[name="og:description"]', 'value']] // NOTE(review): presumably meta tags are normalized to name/value before selection — confirm.
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['#main article'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: ['div.dateplusauthor', 'div.gr-12.push-6.footershare', '#atftbx', 'div.category-modul'] // matches are removed from the extracted content (includes the byline container used for `author`)
+  }
+};
+
+var WwwOssnewsJpExtractor = { // Custom extractor definition for www.ossnews.jp; registered in CustomExtractors under this same name.
+  domain: 'www.ossnews.jp',
+  title: {
+    selectors: ['#alpha-block h1.hxnewstitle']
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: null,
+  dek: null,
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  content: {
+    selectors: ['#alpha-block .section:has(h1.hxnewstitle)'], // the section that contains the headline element used for `title`
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: [] // nothing is stripped from the selected content
+  }
+};
+
+var BuzzapJpExtractor = { // Custom extractor definition for buzzap.jp; registered in CustomExtractors under this same name.
+  domain: 'buzzap.jp',
+  title: {
+    selectors: ['h1.entry-title']
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: {
+    selectors: [['time.entry-date', 'datetime']] // [selector, attr] pair: reads the `datetime` attribute of the <time> element, trimmed.
+  },
+  dek: null,
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): presumably meta tags are normalized to name/value before selection — confirm.
+  },
+  content: {
+    selectors: ['div.ctiframe'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: [] // nothing is stripped from the selected content
+  }
+};
+
+var WwwAsahiComExtractor = { // Custom extractor definition for www.asahi.com; registered in CustomExtractors under this same name.
+  domain: 'www.asahi.com',
+  title: {
+    selectors: ['.ArticleTitle h1']
+  },
+  author: {
+    selectors: [['meta[name="article:author"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  date_published: {
+    selectors: [['meta[name="pubdate"]', 'value']]
+  },
+  dek: null, // no site-specific dek; the description meta tag is mapped to `excerpt` below instead
+  excerpt: {
+    selectors: [['meta[name="og:description"]', 'value']] // NOTE(review): presumably meta tags are normalized to name/value before selection — confirm.
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['#MainInner div.ArticleBody'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: ['div.AdMod', 'div.LoginSelectArea'] // matches are removed from the extracted content
+  }
+};
+
+var WwwSanwaCoJpExtractor = { // Custom extractor definition for www.sanwa.co.jp; registered in CustomExtractors under this same name.
+  domain: 'www.sanwa.co.jp',
+  title: {
+    selectors: ['#newsContent h1']
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: null,
+  dek: {
+    selectors: [['meta[name="og:description"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['#newsContent'], // same container the title is read from
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'] // matches are removed from the extracted content
+  }
+};
+
+var WwwElecomCoJpExtractor = { // Custom extractor definition for www.elecom.co.jp; registered in CustomExtractors under this same name.
+  domain: 'www.elecom.co.jp',
+  title: {
+    selectors: ['title'] // uses the document <title> element rather than an in-page heading
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: null,
+  dek: null,
+  lead_image_url: null,
+  content: {
+    selectors: ['td.TableMain2'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {
+      table: function table($node) { // NOTE(review): the key presumably selects which nodes get this transform — confirm.
+        $node.attr('width', 'auto'); // overwrite the node's width attribute with 'auto'
+      }
+    },
+    clean: [] // nothing is stripped from the selected content
+  }
+};
+
+var ScanNetsecurityNeJpExtractor = { // Custom extractor definition for scan.netsecurity.ne.jp; registered in CustomExtractors under this same name.
+  domain: 'scan.netsecurity.ne.jp',
+  title: {
+    selectors: ['header.arti-header h1.head']
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: {
+    selectors: [['meta[name="article:modified_time"]', 'value']] // NOTE(review): reads the *modified* time tag for the published date — confirm this is intended.
+  },
+  dek: {
+    selectors: ['header.arti-header p.arti-summary']
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  content: {
+    selectors: ['div.arti-content.arti-content--thumbnail'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: ['aside.arti-giga'] // matches are removed from the extracted content
+  }
+};
+
+var JvndbJvnJpExtractor = { // Custom extractor definition for jvndb.jvn.jp; registered in CustomExtractors under this same name.
+  domain: 'jvndb.jvn.jp',
+  title: {
+    selectors: ['title'] // uses the document <title> element rather than an in-page heading
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: null,
+  dek: null,
+  lead_image_url: null,
+  content: {
+    selectors: ['#news-list'],
+    defaultCleaner: false, // NOTE(review): presumably turns off the generic content cleaner for this site — confirm.
+    transforms: {}, // no per-node rewrites for this site
+    clean: [] // nothing is stripped from the selected content
+  }
+};
+
+var GeniusComExtractor = { // Custom extractor definition for genius.com; registered in CustomExtractors under this same name.
+  domain: 'genius.com',
+  title: {
+    selectors: ['h1']
+  },
+  author: {
+    selectors: ['h2 a']
+  },
+  date_published: {
+    selectors: [['meta[itemprop=page_data]', 'value', function (res) { // [selector, attr, transform]: select() passes the trimmed attribute value to the transform.
+      var json = JSON.parse(res); // throws if the attribute value is not valid JSON
+      return json.song.release_date; // throws if the parsed payload has no `song` object
+    }]]
+  },
+  dek: {
+    selectors: [// no dek selectors are defined for this site (empty list)
+    ]
+  },
+  lead_image_url: {
+    selectors: [['meta[itemprop=page_data]', 'value', function (res) { // same page_data payload as date_published, parsed again here
+      var json = JSON.parse(res);
+      return json.song.album.cover_art_url; // throws if `song` or `song.album` is missing or null
+    }]]
+  },
+  content: {
+    selectors: ['.lyrics'],
+    // Is there anything in the content you selected that needs transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {},
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: []
+  }
+};
+
+var WwwJnsaOrgExtractor = { // Custom extractor definition for www.jnsa.org; registered in CustomExtractors under this same name.
+  domain: 'www.jnsa.org',
+  title: {
+    selectors: ['#wgtitle h2']
+  },
+  author: null, // no site-specific selectors; NOTE(review): presumably the generic extractor is used instead — confirm.
+  date_published: null,
+  dek: null, // no site-specific dek; the description meta tag is mapped to `excerpt` below instead
+  excerpt: {
+    selectors: [['meta[name="og:description"]', 'value']] // [selector, attr] pair: select() reads the named attribute from each match and trims it.
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): presumably meta tags are normalized to name/value before selection — confirm.
+  },
+  content: {
+    selectors: ['#main_area'],
+    transforms: {}, // no per-node rewrites for this site
+    clean: ['#pankuzu', '#side'] // matches are removed from the extracted content
+  }
+};
+
 
 
 var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -4931,7 +5163,17 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
   BlisterreviewComExtractor: BlisterreviewComExtractor,
   NewsMynaviJpExtractor: NewsMynaviJpExtractor,
   GithubComExtractor: GithubComExtractor,
-  WwwRedditComExtractor: WwwRedditComExtractor
+  WwwRedditComExtractor: WwwRedditComExtractor,
+  OtrsComExtractor: OtrsComExtractor,
+  WwwOssnewsJpExtractor: WwwOssnewsJpExtractor,
+  BuzzapJpExtractor: BuzzapJpExtractor,
+  WwwAsahiComExtractor: WwwAsahiComExtractor,
+  WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor,
+  WwwElecomCoJpExtractor: WwwElecomCoJpExtractor,
+  ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor,
+  JvndbJvnJpExtractor: JvndbJvnJpExtractor,
+  GeniusComExtractor: GeniusComExtractor,
+  WwwJnsaOrgExtractor: WwwJnsaOrgExtractor
 });
 
 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@@ -6389,14 +6631,16 @@ function select(opts) {
   // extract the attr
 
   if (_Array$isArray(matchingSelector)) {
-    var _matchingSelector = _slicedToArray(matchingSelector, 2),
+    var _matchingSelector = _slicedToArray(matchingSelector, 3),
         selector = _matchingSelector[0],
-        attr = _matchingSelector[1];
+        attr = _matchingSelector[1],
+        transform = _matchingSelector[2];
 
     $match = $(selector);
     $match = transformAndClean($match);
     result = $match.map(function (_, el) {
-      return $(el).attr(attr).trim();
+      var item = $(el).attr(attr).trim();
+      return transform ? transform(item) : item;
     });
   } else {
     $match = $(matchingSelector);
diff --git a/dist/mercury.web.js b/dist/mercury.web.js
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@postlight/mercury-parser",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
   "author": "Postlight <mercury@postlight.com>",
   "homepage": "https://mercury.postlight.com",

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@postlight/mercury-parser",`
`3`		`- "version": "2.0.0",`
	`3`	`+ "version": "2.1.0",`
`4`	`4`	`"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",`
`5`	`5`	`"author": "Postlight <mercury@postlight.com>",`
`6`	`6`	`"homepage": "https://mercury.postlight.com",`