# Content-extraction rules for numerama.com article pages.
# Each directive has the form "name: value"; most values are XPath expressions.

# Headline: the <h1> inside the div whose class is exactly "post-title".
title: //div[@class="post-title"]//h1
# Author: a link to an author page (href contains the /author/ URL prefix)
# carrying the 'is-family-secondary' class, inside an 'is-hidden-mobile' div.
author: //article//div[contains(@class, 'is-hidden-mobile')]//a[contains(@href, 'https://www.numerama.com/author/')][contains(@class, 'is-family-secondary')]

# The in-content ad inclusion script is badly embedded in the page. Handling it
# correctly requires either the html5lib parser or a replace_string rule.
# html5lib is a lot slower, so it is left disabled and replace_string is used.
# parser: html5lib
# NOTE(review): the search string below is empty as written -- confirm it was
# not lost (e.g. an escaped closing tag) before relying on this rule.
replace_string(""): "</div>"

# Article body container.
body: //div[contains(@class, 'post-content')]

# Elements selected by XPath that are stripped.
strip: //span[@class='summary-entry']
strip: //div[contains(@class, 'article-footer')]
strip: //footer
strip: //noscript
strip: //div[@class="embedded-tag-container"]
strip: //p[contains(@class, 'humanoid-videos--title')]
strip: //a[contains(@class, 'humanoid-videos__video-title')]

# Elements stripped by id or class name.
strip_id_or_class: js-cookies-bar
# The site sometimes inserts a malformed menu that breaks tidying and pruning;
# strip it.
strip_id_or_class: index-menu

# Sample article URLs for testing these rules.
test_url: https://www.numerama.com/sciences/818229-une-loi-mathematique-a-ete-decouverte-dans-les-dunes-de-sable-sur-terre-et-sur-mars.html
test_url: http://www.numerama.com/sciences/243352-hubble-detecte-un-trou-noir-supermassif-propulse-hors-de-sa-galaxie.html
test_url: http://www.numerama.com/tech/242703-free-mobile-et-la-4g-en-illimite-ce-quil-faut-savoir.html
# Unclear why everything in this article comes out in bold:
test_url: http://www.numerama.com/business/243686-comme-convenu-quand-lenfer-dune-startup-se-transforme-en-succes-dauto-edition.html