diff options
author | Brendan Long <self@brendanlong.com> | 2018-12-07 04:04:42 +0300 |
---|---|---|
committer | Brendan Long <self@brendanlong.com> | 2018-12-07 04:04:42 +0300 |
commit | 50e50e67216dc9d3556afae5f7bef7e4b290ee73 (patch) | |
tree | 69e22e04503e168b1ba8936623209d8041193f5b | |
parent | 1fd2464d33b059c12f5cc3b35278426902e8774c (diff) | |
parent | d573b791800f7d5eadcbbacccdff1a64f39da6a3 (diff) |
Merge commit 'd573b791800f7d5eadcbbacccdff1a64f39da6a3' as 'data/GrabberConfig'
1259 files changed, 13427 insertions, 0 deletions
diff --git a/data/GrabberConfig/.about.com.txt b/data/GrabberConfig/.about.com.txt new file mode 100644 index 00000000..3953d3e5 --- /dev/null +++ b/data/GrabberConfig/.about.com.txt @@ -0,0 +1,3 @@ +title: //*[@id='title']//h1 +body: //*[(@id = "articlebody")] +test_url: http://nutrition.about.com/od/changeyourdiet/qt/healthysnacks.htm
\ No newline at end of file diff --git a/data/GrabberConfig/.allthingsd.com.txt b/data/GrabberConfig/.allthingsd.com.txt new file mode 100644 index 00000000..ec89c0da --- /dev/null +++ b/data/GrabberConfig/.allthingsd.com.txt @@ -0,0 +1,6 @@ +body: //div[@id='content-left']/div[@class='post'] +strip_id_or_class: social +strip_id_or_class: atd-disqus-disclaimer +tidy: no + +test_url: http://mediamemo.allthingsd.com/20110516/bit-ly-gets-a-new-boss/
\ No newline at end of file diff --git a/data/GrabberConfig/.blog.163.com.txt b/data/GrabberConfig/.blog.163.com.txt new file mode 100644 index 00000000..22092f80 --- /dev/null +++ b/data/GrabberConfig/.blog.163.com.txt @@ -0,0 +1,26 @@ +# To administrator: +# Please replace the hostname with "*.blog.163.com" + +# This filter is tested on: +# http://wangzeke.blog.163.com/blog/static/933015402012410105922228/ +# http://wavow.blog.163.com/blog/static/532284320124117211245/ +# http://elainejeff.blog.163.com/blog/static/1671902912012498727253/ + + +strip://*[contains(@class, 'mcnt ztag')]//span[@style = 'display:none;'] +strip://*[contains(@id, 'divTopLink')] +strip://*[contains(@class, 'phide')] +strip://*[contains(@class, 'thide')] +strip://*[contains(@id, 'topbar')] +strip://*[contains(@class, 'tbar')] +strip://*[contains(@class, 'snl')] +strip://*[contains(@id, 'banner')] + + +title://h3 +author://span[contains(@class, 'ztag pre')] +date://span[contains(@class, 'blogsep')] +body://div[contains(@class, 'mcnt ztag')] + +convert_double_br_tags: yes +test_url: http://elainejeff.blog.163.com/blog/static/167190291201381121654580/ diff --git a/data/GrabberConfig/.blogs.nytimes.com.txt b/data/GrabberConfig/.blogs.nytimes.com.txt new file mode 100644 index 00000000..6517463e --- /dev/null +++ b/data/GrabberConfig/.blogs.nytimes.com.txt @@ -0,0 +1,17 @@ +body: //div[@class='entry-content'] +title: //h1[@class='entry-title'] +# Two author lines because krugman.blogs.nytimes.com is a special case +author: substring-after(//div[@class="box module nocontent"]/h4, "About ") +author: //address/a +date: //meta[@name="PUD"]/@content +date: //*[@class='date'] + +#Removes related content but cleans up article text +strip: //ul[@class='toolsList wrap'] +strip_id_or_class:inlineModule +strip_id_or_class:module +strip_id_or_class:toolsListContainer +prune: no +test_url: http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/ +test_url: 
http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/ +test_url: http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/
\ No newline at end of file diff --git a/data/GrabberConfig/.blogspot.com.txt b/data/GrabberConfig/.blogspot.com.txt new file mode 100644 index 00000000..d04b7683 --- /dev/null +++ b/data/GrabberConfig/.blogspot.com.txt @@ -0,0 +1,11 @@ +date: //*[contains(@class, 'date-header')] +title://*[contains(@class,'post-title')] +body://div[contains(@class,'post-body')] +body://div[contains(@class,'entry-content')] +strip_comments:no +prune:no + +tidy:yes + +test_url: http://themerryone.blogspot.com/2010/08/new-move-new-blog.html +test_url: http://strobist.blogspot.com/2012/01/qa-down-phase-one-rabbit-hole.html
\ No newline at end of file diff --git a/data/GrabberConfig/.businessinsider.com.txt b/data/GrabberConfig/.businessinsider.com.txt new file mode 100644 index 00000000..43aede21 --- /dev/null +++ b/data/GrabberConfig/.businessinsider.com.txt @@ -0,0 +1,9 @@ +title://div[@class="sl-layout-post"]/h1 +body: //div[contains(@class, 'post-content') or contains(@class, 'KonaBody')] +strip: //div[contains(@class, "post-sidebar")] +strip: //div[@id='related-links'] +author://div[@class="byline"]/a +date://div[@class="byline"]/span[@class="date"] +prune: no + +test_url: http://www.businessinsider.com/as-europe-booms-on-bailout-deal-john-boehner-just-confirmed-that-the-us-is-nowhere-2011-7
\ No newline at end of file diff --git a/data/GrabberConfig/.chicagotribune.com.txt b/data/GrabberConfig/.chicagotribune.com.txt new file mode 100644 index 00000000..55a9ed2f --- /dev/null +++ b/data/GrabberConfig/.chicagotribune.com.txt @@ -0,0 +1,8 @@ +date: //span[@class='pubdate'] +author: //div[@id='mod-article-byline']/span[3] +body: //div[@id='area-article-first-block'] | //div[@id='mod-a-body-after-first-para'] +strip_id_or_class: byline + +strip: //div[@id='mod-article-byline'] + +test_url: http://www.chicagotribune.com/classified/automotive/used/chi-auto-refinance-pros-cons-20130513,0,4116070.story diff --git a/data/GrabberConfig/.cnet.com.txt b/data/GrabberConfig/.cnet.com.txt new file mode 100644 index 00000000..eac08aaa --- /dev/null +++ b/data/GrabberConfig/.cnet.com.txt @@ -0,0 +1,16 @@ +title: //meta[@property="og:title"]/@content +body: //div[contains(@class, 'postBody')] +date: //div[@id='nameAndTime']/time +author: //div[@id='nameAndTime']/span[@class='author'] + +strip_id_or_class: image-credit +strip_id_or_class: noAutolink +strip_id_or_class: related + +prune: no +tidy: no + +# early end +replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html> + +test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/
\ No newline at end of file diff --git a/data/GrabberConfig/.craigslist.org.txt b/data/GrabberConfig/.craigslist.org.txt new file mode 100644 index 00000000..a39aa35e --- /dev/null +++ b/data/GrabberConfig/.craigslist.org.txt @@ -0,0 +1,8 @@ +title: //h2[@class='postingtitle'] +date: //p[@class='postinginfo']/date + +body: //figure[@class='iw'] | //section[@class='cltags' or @id='postingbody'] +prune: no +tidy: no + +test_url: http://sfbay.craigslist.org/hhh/index.rss
\ No newline at end of file diff --git a/data/GrabberConfig/.ctv.ca.txt b/data/GrabberConfig/.ctv.ca.txt new file mode 100644 index 00000000..e12fc65a --- /dev/null +++ b/data/GrabberConfig/.ctv.ca.txt @@ -0,0 +1,8 @@ +title: //h3[@class='jhl'] +body: //div[@class='storyBody'] +strip: //p[contains(., 'Please Add Comments')]//following-sibling::* +strip: //p[contains(., 'Please Add Comments')] +strip: //p[em[contains(., 'This story has been updated from its original version')]] +strip: //hr + +test_url: http://montreal.ctv.ca/servlet/an/local/CTVNews/20110914/mtl_construction_110914/20110915?hub=MontrealHome
\ No newline at end of file diff --git a/data/GrabberConfig/.dreamwidth.org.txt b/data/GrabberConfig/.dreamwidth.org.txt new file mode 100644 index 00000000..726e30d6 --- /dev/null +++ b/data/GrabberConfig/.dreamwidth.org.txt @@ -0,0 +1,7 @@ +# Please convert this to *.dreamwidth.org, as users receive individual subdomains. +convert_double_br_tags: yes +strip_image_src: 'dreamwidth.org' +strip_id_or_class: 'currents' +title: //div[contains(@id, 'entrysubj')] +body: //div[contains(@class, 'usercontent')] +test_url: http://dw-news.dreamwidth.org/28922.html
\ No newline at end of file diff --git a/data/GrabberConfig/.dxy.cn.txt b/data/GrabberConfig/.dxy.cn.txt new file mode 100644 index 00000000..d567b66f --- /dev/null +++ b/data/GrabberConfig/.dxy.cn.txt @@ -0,0 +1,4 @@ +body: //div[@id='content'] +prune: no + +test_url: http://neurosurg.dxy.cn/article/87224
\ No newline at end of file diff --git a/data/GrabberConfig/.elpais.com.txt b/data/GrabberConfig/.elpais.com.txt new file mode 100644 index 00000000..d1ada718 --- /dev/null +++ b/data/GrabberConfig/.elpais.com.txt @@ -0,0 +1,23 @@ +title: //meta[@name='DC.title']/@content +title: //div[contains(@class, 'cabecera_noticia')]//h1 +date: //meta[@name='DC.date']/@content +date: //meta[@name='date']/@content +body: //div[@class='columna_texto'] +body: //div[@id='cuerpo_noticia'] +body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] + +prune: no + +strip_id_or_class: disposicion_vertical +strip_id_or_class: ampliar_foto +strip_id_or_class: utilidades +strip_id_or_class: info_relacionada +strip_id_or_class: m-kiosko +strip_id_or_class: info_complementa + +strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] +strip: //div[@id='coment' or @id='foros_not'] +strip: //picture//source + +test_url: http://economia.elpais.com/economia/2012/02/07/actualidad/1328611790_342868.html +test_url: http://internacional.elpais.com/internacional/2012/02/07/actualidad/1328602145_448315.html diff --git a/data/GrabberConfig/.etc.se.txt b/data/GrabberConfig/.etc.se.txt new file mode 100644 index 00000000..051270ea --- /dev/null +++ b/data/GrabberConfig/.etc.se.txt @@ -0,0 +1,11 @@ +body: //div[contains(@class, 'body-preamble')] +date: //meta[@name='dcterms.date']/@content +author: //meta[@name='dcterms.creator']/@content + +strip_id_or_class: top-link + +test_url: http://stockholm.etc.se/debatt/jobbgaraget-gav-hopp-ungdomar-i-tensta +test_contains: Jobbgaraget startades av Angeles + +test_url: http://goteborg.etc.se/inrikes/snart-oppnar-stans-forsta-ekogalleria +test_contains: Krasnapolsky och just nu diff --git a/data/GrabberConfig/.ew.com.txt b/data/GrabberConfig/.ew.com.txt new file mode 100644 index 00000000..c4b886f0 --- /dev/null +++ b/data/GrabberConfig/.ew.com.txt @@ -0,0 +1,14 @@ +next_page_link: //span[@class='paging-next']/a[contains(., 'NEXT')] 
+strip_id_or_class: article-paging +strip_id_or_class: eyebrow +strip_id_or_class: underbar +strip_id_or_class: extras +strip_id_or_class: share +strip_id_or_class: recap-links +strip_id_or_class: tvr-author +strip_id_or_class: pub-date +strip_id_or_class: post-title + +title: //h1[@class='post-title'] + +test_url: http://tvrecaps.ew.com/recap/fringe-season-4-episode-2/
\ No newline at end of file diff --git a/data/GrabberConfig/.finance.yahoo.com.txt b/data/GrabberConfig/.finance.yahoo.com.txt new file mode 100644 index 00000000..a10b86cd --- /dev/null +++ b/data/GrabberConfig/.finance.yahoo.com.txt @@ -0,0 +1,12 @@ +title: //meta[@property='og:title']/@content +body: //div[@id='y-article-bd'] +body: //div[contains(@class, 'yom-art-content')] +strip: //div[contains(@class, 'related-companies')] +strip: //div[@id='y-article-related'] +strip: //div[@id='ypf-article-related'] +prune: no + +single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] + +test_url: https://sg.finance.yahoo.com/news/former-xstrata-bosss-venture-raises-095622450.html +test_contains: The former boss of mining company Xstrata diff --git a/data/GrabberConfig/.fivefilters.org.txt b/data/GrabberConfig/.fivefilters.org.txt new file mode 100644 index 00000000..7afc629c --- /dev/null +++ b/data/GrabberConfig/.fivefilters.org.txt @@ -0,0 +1,6 @@ +# Pastepad specific +title: //div[@id='ff-pastepad-content']//h1 +body: //div[@id='ff-pastepad-content'] +prune: no +# todo: add test file +test_url: http://pastepad.fivefilters.org/test.html diff --git a/data/GrabberConfig/.fok.nl.txt b/data/GrabberConfig/.fok.nl.txt new file mode 100644 index 00000000..e6ad7d52 --- /dev/null +++ b/data/GrabberConfig/.fok.nl.txt @@ -0,0 +1,13 @@ +title: //h1[@class='title'] +body: //div[@id='itemBody'] + +strip_id_or_class: itemFooter + +replace_string(90%;">Lees ook): 0%;"></h3></div></body></html> + +tidy: no +prune: no + +http_header(user-agent): Googlebot/2.1 + +test_url: http://sport.fok.nl/nieuws/548725/1/1/50/rosicky-traint-weer-mee-bij-tsjechie.html
\ No newline at end of file diff --git a/data/GrabberConfig/.gitattributes b/data/GrabberConfig/.gitattributes new file mode 100644 index 00000000..412eeda7 --- /dev/null +++ b/data/GrabberConfig/.gitattributes @@ -0,0 +1,22 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/data/GrabberConfig/.gitignore b/data/GrabberConfig/.gitignore new file mode 100644 index 00000000..43c1b6f8 --- /dev/null +++ b/data/GrabberConfig/.gitignore @@ -0,0 +1,170 @@ +################# +## Full-Text RSS +################# +version.php +version.txt +index.php + +################# +## Eclipse +################# + +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+ +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results +[Dd]ebug/ +[Rr]elease/ +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.vspscc +.builds +*.dotCover + +## TODO: If you have NuGet Package Restore enabled, uncomment this +#packages/ + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf + +# Visual Studio profiler +*.psess +*.vsp + +# ReSharper is a .NET coding add-in +_ReSharper* + +# Installshield output folder +[Ee]xpress + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish + +# Others +[Bb]in +[Oo]bj +sql +TestResults +*.Cache +ClientBin +stylecop.* +~$* +*.dbmdl +Generated_Code #added for RIA/Silverlight projects + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. 
Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML + + + +############ +## Windows +############ + +# Windows image file caches +Thumbs.db + +# Folder config file +Desktop.ini + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg + +# Mac crap +.DS_Store diff --git a/data/GrabberConfig/.hardware.info.txt b/data/GrabberConfig/.hardware.info.txt new file mode 100644 index 00000000..82c33790 --- /dev/null +++ b/data/GrabberConfig/.hardware.info.txt @@ -0,0 +1,6 @@ +title: //h1[@itemprop='headline'] +author: //span[@itemprop='author'] +body: //article[@itemprop='articleBody'] +date: //span[@itemprop='datePublished']/@content + +test_url: http://be.hardware.info/reviews/6197/game-pcs-van-0-25-en-5-jaar-oud-review-tijd-om-te-upgraden diff --git a/data/GrabberConfig/.ifeng.com.txt b/data/GrabberConfig/.ifeng.com.txt new file mode 100644 index 00000000..f8b7993b --- /dev/null +++ b/data/GrabberConfig/.ifeng.com.txt @@ -0,0 +1,14 @@ +# Please change host to *.ifeng.com +# same config works well on other subdomains +# tested on following links +# http://phtv.ifeng.com/program/qqsrx/detail_2012_11/28/19613849_0.shtml +# http://finance.ifeng.com/news/corporate/20121128/7359279.shtml + +tidy:no + +title://h1[contains(@id,'artical_topic')] + +body://div[contains(@id,'artical_real')] + +next_page_link://*[contains(@id,'pagenext')] +test_url: http://news.ifeng.com/history/zhongguojindaishi/detail_2012_04/01/13605159_0.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/.livejournal.com.txt b/data/GrabberConfig/.livejournal.com.txt new file mode 100644 index 00000000..551ace47 --- /dev/null +++ b/data/GrabberConfig/.livejournal.com.txt @@ -0,0 +1,6 @@ +title: //title +strip_image_src: 'l-stat.livejournal.com' +strip_image_src: 'www.livejournal.com' +strip_image_src: 'l-userpic.livejournal.com' +test_url: http://news.livejournal.com/136664.html +test_url: http://stelazin.livejournal.com/91363.html
\ No newline at end of file diff --git a/data/GrabberConfig/.m.wikihow.com.txt b/data/GrabberConfig/.m.wikihow.com.txt new file mode 100644 index 00000000..5be49fe1 --- /dev/null +++ b/data/GrabberConfig/.m.wikihow.com.txt @@ -0,0 +1,17 @@ +# ...&printable=yes +body: //div[@id='bodycontents'] +# m.wikihow.com/... +body: //div[@id='article'] +prune: no +tidy: no +strip_id_or_class: gatEditSection +strip_id_or_class: relatedwikihows +#strip: //div[contains(@class, 'step_num')] + +replace_string(<script ): <div style="display: none" +replace_string(</script>): </div> + +single_page_link: //a[@id='gatPrintView'] +single_page_link: concat(//link[@rel='canonical']/@href, '?printable=yes') + +test_url: http://pt.m.wikihow.com/Criar-um-Script-de-Login-Seguro-em-PHP-e-MySQL
\ No newline at end of file diff --git a/data/GrabberConfig/.metafilter.com.txt b/data/GrabberConfig/.metafilter.com.txt new file mode 100644 index 00000000..3b4e121b --- /dev/null +++ b/data/GrabberConfig/.metafilter.com.txt @@ -0,0 +1,9 @@ +body: //div[contains(@class, 'copy') or contains(@class, 'comments')] +strip_id_or_class: related +strip_id_or_class: whitesmallcopy +strip: //a[. = 'Subscribe'] +strip: //h1/span[@class = 'smallcopy'] +strip: //a[@class = 'skip'] +strip: //div[@id = 'logo'] +strip: //div[contains(@class, 'comments') and contains(., 'You are not currently logged in')] +test_url: http://ask.metafilter.com/159539/Connect-ZERO-I-feel-like-an-idiot
\ No newline at end of file diff --git a/data/GrabberConfig/.mozilla.org.txt b/data/GrabberConfig/.mozilla.org.txt new file mode 100644 index 00000000..091aa76c --- /dev/null +++ b/data/GrabberConfig/.mozilla.org.txt @@ -0,0 +1,5 @@ +strip_id_or_class: comments +prune: no +date: //p[@class="entry-posted"]//abbr[@class="published"]/@title + +test_url: https://hacks.mozilla.org/2013/05/how-to-spread-the-word-about-your-code/
\ No newline at end of file diff --git a/data/GrabberConfig/.news.yahoo.com.txt b/data/GrabberConfig/.news.yahoo.com.txt new file mode 100644 index 00000000..84ea85e2 --- /dev/null +++ b/data/GrabberConfig/.news.yahoo.com.txt @@ -0,0 +1,6 @@ +title: //h1[@class='headline'] +body: //cite[contains(@class,'byline')] | //div[contains(@class,'yom-art-content')] +strip: //cite/abbr +tidy: no + +test_url: http://ca.news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html
\ No newline at end of file diff --git a/data/GrabberConfig/.nytimes.com.txt b/data/GrabberConfig/.nytimes.com.txt new file mode 100644 index 00000000..78096d59 --- /dev/null +++ b/data/GrabberConfig/.nytimes.com.txt @@ -0,0 +1,4 @@ +strip: //*[@id='insideNYTimesScrollWrapper'] | //*[contains(@class, 'articleInline')] +single_page_link: //li[@class='singlePage']/a +test_url: http://www.nytimes.com/2010/07/13/science/13gravity.html?_r=1&pagewanted=print +test_url: http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html?_r=1&hp
\ No newline at end of file diff --git a/data/GrabberConfig/.onliner.by.txt b/data/GrabberConfig/.onliner.by.txt new file mode 100644 index 00000000..84470050 --- /dev/null +++ b/data/GrabberConfig/.onliner.by.txt @@ -0,0 +1,9 @@ +title: //*[contains(@class, 'b-posts-1-item__title')] +body: //figure[contains(@class, 'b-posts-1-item__image')] | //div[contains(@class, 'b-posts-1-item__text')] +date: //article//time/@datetime + +tidy: no +prune: no + +test_url: http://tech.onliner.by/feed +test_url: http://tech.onliner.by/2014/04/24/the-amazing-spider-man-2
\ No newline at end of file diff --git a/data/GrabberConfig/.orf.at.txt b/data/GrabberConfig/.orf.at.txt new file mode 100644 index 00000000..83c4ec98 --- /dev/null +++ b/data/GrabberConfig/.orf.at.txt @@ -0,0 +1,32 @@ +single_page_link: //div[@id='ss-storyText']//p[@class='readMore']/a +single_page_link: //div[@id='ss-storyText']//p[contains(., 'Mehr dazu')]/a + +title: substring-before(//title,' - ') +body: //div[@id="ss-storyText"] +author: substring-before(//div[@id="ss-storyText"]//p[contains(text(), ', ORF.at')], ', ORF.at') +strip: //div[@id="ss-storyText"]//p[contains(text(), ', ORF.at')] +date: substring-after(//div[@class='storyMeta socialshare']//p[@class='date'],'Publiziert am') +strip: //p[@class='date'] + +strip: //p[@class='credit'] +strip: //p[@class='toplink'] +strip: //div[@id="ss-storyText"]/h1 +strip: //div[@class='socialButtons'] +strip: //div[@class='storyMeta socialshare'] +strip: //div[@class='socialShareWrapper'] +strip: //div[@id='socialshareprivacy'] +strip: //div[@class='storyMeta'] +strip: //div[@class='remote'] + +prune: no +tidy: no + +#test_url: http://orf.at/stories/2317355/ +#test_url: http://orf.at/stories/2084731/ +#test_url: http://orf.at/stories/2317313/2317311/ +test_url: http://wien.orf.at/news/stories/2746414/ +test_url: http://ooe.orf.at/news/stories/2750613/ +test_url: http://science.orf.at/stories/2774991/ +#test_url: http://orf.at/stories/2339962/ +#test_url: http://orf.at/stories/2339958/ +test_url: http://help.orf.at/stories/1770242/ diff --git a/data/GrabberConfig/.readthedocs.io.txt b/data/GrabberConfig/.readthedocs.io.txt new file mode 100644 index 00000000..40b0c0ad --- /dev/null +++ b/data/GrabberConfig/.readthedocs.io.txt @@ -0,0 +1,3 @@ +title: //h1 +body: //div[@role='main'] +test_url: http://docs.readthedocs.io/en/latest/getting_started.html diff --git a/data/GrabberConfig/.reuters.com.txt b/data/GrabberConfig/.reuters.com.txt new file mode 100644 index 00000000..74118ece --- /dev/null +++ 
b/data/GrabberConfig/.reuters.com.txt @@ -0,0 +1,11 @@ +title: //div[@id='maincontent']//h1 +body: //div[@id='resizeableText'] + +single_page_link: concat(//link[@rel='canonical']/@href, '?sp=true') + +test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710 +test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews +# multipage link +test_url: http://cn.reuters.com/article/idCNKBS0FF0UL20140710 +test_url: http://br.reuters.com/article/topNews/idBRKBN0JN1D420141209 +test_contains: Em 2015 a tendência é que diff --git a/data/GrabberConfig/.slashdot.org.txt b/data/GrabberConfig/.slashdot.org.txt new file mode 100644 index 00000000..a035e2d1 --- /dev/null +++ b/data/GrabberConfig/.slashdot.org.txt @@ -0,0 +1,16 @@ +title: //span[starts-with(@id, 'title-')] + +body: //div[starts-with(@id, 'fhbody-')] + +prune: no +tidy: no + +http_header(user-agent): PHP/5.3 + +# follow first link +#single_page_link: (//div[starts-with(@id, 'fhbody-')]//a[contains(@href, '://')])[1] + +test_url: http://apple.slashdot.org/story/13/03/21/1736239/new-os-x-trojan-adware-injects-ads-into-chrome-firefox-safari +test_url: http://developers.slashdot.org/story/13/03/23/1426215/will-donglegate-affect-your-decision-to-attend-pycon +# feed URL +test_url: http://rss.slashdot.org/Slashdot/slashdot diff --git a/data/GrabberConfig/.smashingmagazine.com.txt b/data/GrabberConfig/.smashingmagazine.com.txt new file mode 100644 index 00000000..bc25966b --- /dev/null +++ b/data/GrabberConfig/.smashingmagazine.com.txt @@ -0,0 +1,11 @@ +title://article[contains(@id, "post-")]/h2 +author://ul[@class="postmetadata clearfix"]/li[@class="author"] +date://ul[@class="postmetadata clearfix"]/li[@class="date"] +body://article[contains(@id, "post-")] +body: //article[contains(@class, "post-")] +strip_id_or_class: sot +strip://div[@class="ad ed"] +prune:yes + +test_url: http://wp.smashingmagazine.com/2012/11/08/complete-guide-custom-post-types/ +test_url: 
https://www.smashingmagazine.com/2017/04/browser-extension-edge-chrome-firefox-opera-brave-vivaldi/ diff --git a/data/GrabberConfig/.sputniknews.com.txt b/data/GrabberConfig/.sputniknews.com.txt new file mode 100644 index 00000000..f50737dd --- /dev/null +++ b/data/GrabberConfig/.sputniknews.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' b-article__header ')] | //div[contains(@class, 'b-article__lead') or contains(@class, 'b-article__text') or contains(@class, 'b-article__videoclub_embed')] + +test_url: https://cz.sputniknews.com/videoklub/201708045748463-kletenka-portal-letuska-pilot/ +test_contains: Chocotravel zveřejnil diff --git a/data/GrabberConfig/.stackexchange.com.txt b/data/GrabberConfig/.stackexchange.com.txt new file mode 100644 index 00000000..c9d44b1d --- /dev/null +++ b/data/GrabberConfig/.stackexchange.com.txt @@ -0,0 +1,4 @@ +title: //title +body: //div[@id='question']//div[contains(@class,'post-text')] | //div[@id='answers-header']//h2 | //div[contains(@class,'accepted-answer')]//div[contains(@class,'post-text')] + +test_url: http://cstheory.stackexchange.com/questions/14811/what-is-the-enlightenment-im-supposed-to-attain-after-studying-finite-automata/14818#14818 diff --git a/data/GrabberConfig/.stanford.edu.txt b/data/GrabberConfig/.stanford.edu.txt new file mode 100644 index 00000000..96490315 --- /dev/null +++ b/data/GrabberConfig/.stanford.edu.txt @@ -0,0 +1,5 @@ +title: //div[@id='aueditable']/h1 +body: //div[@id='content'] +strip: //div[@id='message' or @id='linklist'] +prune: no +test_url: http://plato.stanford.edu/entries/supervenience/
\ No newline at end of file diff --git a/data/GrabberConfig/.theonion.com.txt b/data/GrabberConfig/.theonion.com.txt new file mode 100644 index 00000000..54e014a9 --- /dev/null +++ b/data/GrabberConfig/.theonion.com.txt @@ -0,0 +1,10 @@ +title: //h2[@class='title'] | //h1[contains(concat(' ',normalize-space(@class),' '),'headline')] +date: substring-before(//p[@class='meta'], '|') +body: //div[@class='article_body'] | //div[@class='story'] | //div[contains(concat(' ',normalize-space(@class),' '),'post-content')] + +strip: //h2[@class='title'] +strip: //p[@class='meta'] +strip: //div[@class='ga_section'] +strip: //div[@id='recent_slider'] + +test_url: https://politics.theonion.com/inconsolable-jeff-sessions-tries-to-commit-suicide-by-s-1826462420 diff --git a/data/GrabberConfig/.thueringer-allgemeine.de.txt b/data/GrabberConfig/.thueringer-allgemeine.de.txt new file mode 100644 index 00000000..77d5a51c --- /dev/null +++ b/data/GrabberConfig/.thueringer-allgemeine.de.txt @@ -0,0 +1,8 @@ +title: //div[@class='qp_headline']/h1 +body: //div[contains(@class, 'article')]//div[@class='qp_text'] +prune: no + +strip: //div[@id='_DetailPortlet_WAR_queport_zgtperson'] +strip: //div[@class='qp_embedded'] + +test_url: http://apolda.thueringer-allgemeine.de/web/apolda/startseite/detail/-/specific/Neue-Superknolle-beim-Heichelheimer-Kartoffelfest-praemiert-447764498
\ No newline at end of file diff --git a/data/GrabberConfig/.time.com.txt b/data/GrabberConfig/.time.com.txt new file mode 100644 index 00000000..40ee26f8 --- /dev/null +++ b/data/GrabberConfig/.time.com.txt @@ -0,0 +1,26 @@ +date: //meta[@name='date']/@content +author: //meta[@name='byline']/@content +date: //span[@class = 'date'] +title: //div[@id='print']//h1 +title: //h1[@class="entry-title"] +body: //article//div[contains(@class,'entry-content')] +strip: //div[@class='more-ways'] +strip: //div[@id = 'stayConnected'] +strip: //p[child::a[@rel = 'bookmark']] +strip: //p[starts-with(string(.),'(MORE:')] +strip: //p[starts-with(string(.),'(PHOTOS:')] +strip: //aside +#move_into(//p[../@class = 'entry-content'][position() = last()])://div[@id = 'featbox'] +prune: no + +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> + + +strip: //div[@id='print']//div[contains(@class, 'thumbnail')] + +single_page_link: //footer//a[contains(@href, '/printout/')] + +test_url: http://content.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html +test_url: http://healthland.time.com/2011/07/24/amy-winehouse-and-the-pain-of-addiction/ +test_url: http://nation.time.com/2013/09/17/navy-yard-shooter-had-been-treated-for-mental-health-problems/
\ No newline at end of file diff --git a/data/GrabberConfig/.tweakblogs.net.txt b/data/GrabberConfig/.tweakblogs.net.txt new file mode 100644 index 00000000..784586cc --- /dev/null +++ b/data/GrabberConfig/.tweakblogs.net.txt @@ -0,0 +1,4 @@ +body: //div[@class="article"] +author: //p[@class="author"]/a + +test_url: http://harryl.tweakblogs.net/blog/11988/voorstellen diff --git a/data/GrabberConfig/.usinenouvelle.com.txt b/data/GrabberConfig/.usinenouvelle.com.txt new file mode 100644 index 00000000..380f2724 --- /dev/null +++ b/data/GrabberConfig/.usinenouvelle.com.txt @@ -0,0 +1,7 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' articleTitre ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' articleBrut ')] +date: //span[contains(concat(' ',normalize-space(@class),' '),' articleAuteurDate')]//time +author: //span[contains(concat(' ',normalize-space(@class),' '),' articleAuteurDate')]//span/a +prune: no + +test_url: http://www.usinenouvelle.com/article/froid-devant.N321392 diff --git a/data/GrabberConfig/.wikihow.com.txt b/data/GrabberConfig/.wikihow.com.txt new file mode 100644 index 00000000..42269d32 --- /dev/null +++ b/data/GrabberConfig/.wikihow.com.txt @@ -0,0 +1,17 @@ +# ...&printable=yes +body: //div[@id='bodycontents'] +prune: no +tidy: no +strip_id_or_class: gatEditSection +strip_id_or_class: relatedwikihows +strip_id_or_class: sp_method_toc +#strip: //div[contains(@class, 'step_num')] + +#replace_string(<script ): <div style="display: none" +#replace_string(</script>): </div> + +single_page_link: //a[@id='gatPrintView'] +single_page_link: concat(//link[@rel='canonical']/@href, '?printable=yes') + +test_url: http://pt.wikihow.com/Criar-um-Script-de-Login-Seguro-em-PHP-e-MySQL +test_url: https://de.wikihow.com/index.php?title=Handschrift-verbessern diff --git a/data/GrabberConfig/.wikimedia.org.txt b/data/GrabberConfig/.wikimedia.org.txt new file mode 100644 index 00000000..ff990c2c --- /dev/null +++ 
b/data/GrabberConfig/.wikimedia.org.txt @@ -0,0 +1,12 @@ +title: //h1[@id='firstHeading'] +body: //div[@id = 'bodyContent'] +strip_id_or_class: editsection +strip_id_or_class: toc +strip_id_or_class: vertical-navbox +strip: //div[@id='catlinks'] +strip: //div[@id='jump-to-nav'] +strip: //div[@class='thumbcaption']//div[@class='magnify'] +strip: //table[@class='navbox'] +prune: no +tidy: no +test_url: https://secure.wikimedia.org/wikipedia/en/wiki/Christopher_Lloyd
\ No newline at end of file diff --git a/data/GrabberConfig/.wikipedia.org.txt b/data/GrabberConfig/.wikipedia.org.txt new file mode 100644 index 00000000..1da400bc --- /dev/null +++ b/data/GrabberConfig/.wikipedia.org.txt @@ -0,0 +1,24 @@ +title: //h1[@id='firstHeading'] +body: //div[@id = 'bodyContent'] +strip_id_or_class: editsection +#strip_id_or_class: toc +strip_id_or_class: vertical-navbox +strip: //*[@id='toc'] +strip: //div[@id='catlinks'] +strip: //div[@id='jump-to-nav'] +strip: //div[@class='thumbcaption']//div[@class='magnify'] +strip: //table[@class='navbox'] +#strip: //table[contains(@class, 'infobox')] +strip: //div[@class='dablink'] +strip: //div[@id='contentSub'] +strip: //table[contains(@class, 'metadata')] +strip: //*[contains(@class, 'noprint')] +strip: //span[@class='noexcerpt'] + +http_header(user-agent): Mozilla/5.2 + +prune: no +tidy: no +test_url: http://en.wikipedia.org/wiki/Christopher_Lloyd +test_url: https://en.wikipedia.org/wiki/Ronnie_James_Dio +test_url: https://en.wikipedia.org/wiki/Metallica
\ No newline at end of file diff --git a/data/GrabberConfig/.wordpress.com.txt b/data/GrabberConfig/.wordpress.com.txt new file mode 100644 index 00000000..42e56a5c --- /dev/null +++ b/data/GrabberConfig/.wordpress.com.txt @@ -0,0 +1,27 @@ +# try to target content block within div#content +body: //div[@id="content"]//div[contains(@class, 'entry-content') or contains(@class, 'entrytext') or @class='main' or @class='entry'] +# if that fails, get div#content +body: //div[@id='content'] +title: //meta[@property='og:title']/@content + +date: //div[@id='content']//span[contains(@class, 'entry-date')] +date: //div[contains(@class, 'entry-meta')]//time[@pubdate or @pubDate] +author: //div[contains(@class, 'entry-meta')]//a[@rel='author'] + +prune: no + +strip: //nav +strip: //header +strip: //*[@id='comments' or @id='respond'] +strip: //div[contains(@class, 'comments')] +strip_id_or_class: sharedaddy +strip_id_or_class: wpadvert +strip_id_or_class: commentlist +strip_id_or_class: sociable +strip_id_or_class: related_post +strip_id_or_class: wp-socializer +strip_id_or_class: addtoany +strip: //div[contains(concat(' ',normalize-space(@class),' '),' navigation ')] +#strip: //iframe + +test_url: https://elisehahn.wordpress.com/2013/09/22/looking-back-to-move-forward-navigating-race-and-racism-in-neoliberal-terrain/ diff --git a/data/GrabberConfig/20min.ch.txt b/data/GrabberConfig/20min.ch.txt new file mode 100644 index 00000000..cd8e3fc0 --- /dev/null +++ b/data/GrabberConfig/20min.ch.txt @@ -0,0 +1,24 @@ +# Author: cirnod@gmail.com + +tidy: no +prune: no + +title: //h1 +date: /html/body/div[3]/div[1]/div[6]/div/div[1]/div[2]/div[1]/div/p +body: //div[@class='published clearfix'] | //div[@class='story_titles']/h3 | //div[@class='story_text'] + +# General Cleanup +strip_id_or_class: info_panel +strip_id_or_class: info_poll +strip_id_or_class: teaser +strip_id_or_class: panelbox +strip_id_or_class: polls +strip_id_or_class: warning +strip_id_or_class: vplaceholder + +# visual 
removal only -> complete removal doesn't work +replace_string(Print</a>): </a> + +# Try yourself +test_url: http://www.20min.ch/wissen/news/story/31588952 +test_url: http://www.20min.ch/digital/dossier/apple/story/So-einfach-laesst-sich-das-iPhone-6-Plus-verbiegen-24651169 diff --git a/data/GrabberConfig/24.ae.txt b/data/GrabberConfig/24.ae.txt new file mode 100644 index 00000000..6e515076 --- /dev/null +++ b/data/GrabberConfig/24.ae.txt @@ -0,0 +1,8 @@ +title: //div[@id='DivTitle'] +body: //div[@id='divImages' or @id='Divkhabarcontent'] +author: //div[@id='DivAuthor'] + +prune: no + +test_url: http://24.ae/article.aspx?ArticleId=123304 +test_url: http://24.ae/rss.aspx?pageId=30 diff --git a/data/GrabberConfig/24ways.org.txt b/data/GrabberConfig/24ways.org.txt new file mode 100644 index 00000000..94819b57 --- /dev/null +++ b/data/GrabberConfig/24ways.org.txt @@ -0,0 +1,6 @@ +title: //div[@class='meta']/h2/a +author: //header[@class="c-article__header"]//p[contains(concat(' ',normalize-space(@class),' '),' p-author ')] +date: //time//@datetime +body: //div[@id='article'] +strip: //div[@class='domore'] +test_url: http://24ways.org/2011/composing-the-new-canon diff --git a/data/GrabberConfig/36kr.com.txt b/data/GrabberConfig/36kr.com.txt new file mode 100644 index 00000000..d73d7de5 --- /dev/null +++ b/data/GrabberConfig/36kr.com.txt @@ -0,0 +1,8 @@ +title: //h1[contains(@class, 'entry-title')] +date: //meta[@name='weibo: article:create_at']/@content +body: //div[contains(@class, 'mainContent')] +strip_id_or_class: related_topics + +prune: no + +test_url: http://www.36kr.com/p/207879.html
\ No newline at end of file diff --git a/data/GrabberConfig/37signals.com.txt b/data/GrabberConfig/37signals.com.txt new file mode 100644 index 00000000..531cac1e --- /dev/null +++ b/data/GrabberConfig/37signals.com.txt @@ -0,0 +1,6 @@ +title: //div[@class='post_header']//h2/a +author: //span[@class='author'] +date: //span[@class='date'] +body: //div[@id='Content'] + +test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department
\ No newline at end of file diff --git a/data/GrabberConfig/3quarksdaily.com.txt b/data/GrabberConfig/3quarksdaily.com.txt new file mode 100644 index 00000000..80a3958f --- /dev/null +++ b/data/GrabberConfig/3quarksdaily.com.txt @@ -0,0 +1,9 @@ +body: //div[@class='content'] +date: //div[@class='content']/h2 +strip: //div[@class='content']/h2 +title: //div[@class='content']/h3 + +strip: //div[@id='postmenu'] +strip: //div[@class='trackback'] +tidy: no +test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html
\ No newline at end of file diff --git a/data/GrabberConfig/3voor12.vpro.nl.txt b/data/GrabberConfig/3voor12.vpro.nl.txt new file mode 100644 index 00000000..b846b050 --- /dev/null +++ b/data/GrabberConfig/3voor12.vpro.nl.txt @@ -0,0 +1,11 @@ +body: //div[@id='main'] +title: //div[@class='intro']/h1 +author: //ul[@class='text-data']/li[@class='author'] +date: //ul[@class='text-data']/li[@class='date'] +convert_double_br_tags: yes +tidy: no + +strip: //div[@class='share'] +strip: //*[@class='zoom'] +strip: //div[@id='disqus_thread'] +test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html
\ No newline at end of file diff --git a/data/GrabberConfig/43folders.com.txt b/data/GrabberConfig/43folders.com.txt new file mode 100644 index 00000000..3777c66f --- /dev/null +++ b/data/GrabberConfig/43folders.com.txt @@ -0,0 +1,4 @@ +body: //*[@class = 'content'] +author: //*[@class = 'submitted']/a +date: substring-after(//*[@class = 'submitted']/text(), '|') +test_url: http://www.43folders.com/2011/04/22/cranking
\ No newline at end of file diff --git a/data/GrabberConfig/500px.com.txt b/data/GrabberConfig/500px.com.txt new file mode 100644 index 00000000..b9b7e9dd --- /dev/null +++ b/data/GrabberConfig/500px.com.txt @@ -0,0 +1,27 @@ +# very loose setup for both 500px.com/photo/* and 500px.com/blog/* +# photo page example: http://500px.com/photo/4181666 +# blog page example: http://500px.com/blog/110 + +# avoid "no text" error +tidy:no +prune:no + +# reorganize photo page elements +#body://div[contains(@class,'container')] +move_into(body)://div[contains(@id,'thephoto')] +move_into(body)://div[contains(@id,'description')] +move_into(body)://div[contains(@id,'tags')] +move_into(body)://div[contains(@id,'photo-info')] + +# clean photo page info +strip://span[contains(@id,'copyright')] +strip://*[contains(@id,'store')] +strip://*[contains(@id,'user-info')] +strip://*[contains(@id,'photo-stats')] +strip://*[contains(@id,'voting_controls_container')] +strip://*[contains(@id,'more-photos')] +strip://*[contains(@id,'embed-photo')] + +# clean blog page side bar +strip://*[contains(@class,'col d3 clearafter')] +test_url: http://500px.com/photo/3641041?from=editors
\ No newline at end of file diff --git a/data/GrabberConfig/512pixels.net.txt b/data/GrabberConfig/512pixels.net.txt new file mode 100644 index 00000000..02a996f7 --- /dev/null +++ b/data/GrabberConfig/512pixels.net.txt @@ -0,0 +1,2 @@ +title: //meta[@property='og:title']/@content +test_url: http://www.512pixels.net/blog/2014/10/the-move diff --git a/data/GrabberConfig/5by5.tv.txt b/data/GrabberConfig/5by5.tv.txt new file mode 100644 index 00000000..59b70a99 --- /dev/null +++ b/data/GrabberConfig/5by5.tv.txt @@ -0,0 +1,9 @@ +body: //*[@id="episode"] +prune: no +tidy: no + +autodetect_next_page: no +strip_id_or_class: player + +strip://*[@id="header"] +test_url: http://5by5.tv/buildanalyze/60
\ No newline at end of file diff --git a/data/GrabberConfig/7newsbelize.com.txt b/data/GrabberConfig/7newsbelize.com.txt new file mode 100644 index 00000000..46d09f8e --- /dev/null +++ b/data/GrabberConfig/7newsbelize.com.txt @@ -0,0 +1,7 @@ +title: //*[@id='sstitle'] +body: //div[@id='sstory'] +strip_id_or_class: newsoptions +prune: no + +test_url: http://www.7newsbelize.com/sstory.php?nid=25654 +test_url: http://www.7newsbelize.com/7news.xml
\ No newline at end of file diff --git a/data/GrabberConfig/8e-etage.fr.txt b/data/GrabberConfig/8e-etage.fr.txt new file mode 100644 index 00000000..adfc2355 --- /dev/null +++ b/data/GrabberConfig/8e-etage.fr.txt @@ -0,0 +1,21 @@ + +test_url: https://8e-etage.fr/2018/04/13/podcast-lhistoire-des-mapuches-a-travers-les-siecles/ + +strip_id_or_class: subscribe-block +strip_id_or_class: mashsb-container +strip_id_or_class: entry-author-box +strip_id_or_class: it-exchange-membership-restricted-content + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' subscribe-block ')] +login_uri: https://8e-etage.fr/ +login_username_field: log +login_password_field: pwd +login_extra_fields: rd_login=true +login_extra_fields: rememberme=on + +test_url: https://8e-etage.fr/2018/07/02/la-france-championne-du-pessimisme/ +test_url: https://8e-etage.fr/2018/05/28/au-bresil-les-enfants-precheurs-crevent-lecran/ diff --git a/data/GrabberConfig/9gag.com.txt b/data/GrabberConfig/9gag.com.txt new file mode 100644 index 00000000..4ebb62ad --- /dev/null +++ b/data/GrabberConfig/9gag.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2F9gag.com%2Fgag%2FaDwQnO7 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' badge-post-container ')] +test_url: http://9gag.com/gag/aDwQnO7 diff --git a/data/GrabberConfig/9to5mac.com.txt b/data/GrabberConfig/9to5mac.com.txt new file mode 100644 index 00000000..c554a65b --- /dev/null +++ b/data/GrabberConfig/9to5mac.com.txt @@ -0,0 +1,4 @@ +strip: //p[preceding::hr]/span[@class="embed-youtube"] +strip: //hr + +test_url: 
https://9to5mac.com/2017/04/14/toshiba-semiconductor-business-apple-foxconn/ diff --git a/data/GrabberConfig/LICENSE.txt b/data/GrabberConfig/LICENSE.txt new file mode 100644 index 00000000..e078f68f --- /dev/null +++ b/data/GrabberConfig/LICENSE.txt @@ -0,0 +1,3 @@ +These files are released to the public domain. + +See https://creativecommons.org/publicdomain/zero/1.0/ for more information. diff --git a/data/GrabberConfig/README.md b/data/GrabberConfig/README.md new file mode 100644 index 00000000..ab5b12d9 --- /dev/null +++ b/data/GrabberConfig/README.md @@ -0,0 +1,40 @@ +Full-Text RSS site config files +================ + +[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If no rules are found, it tries to detect the content block automatically. + +This repository contains the site-specific extraction rules we rely on in Full-Text RSS. + +### Contributing changes + +We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for. + +We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface. + +You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model: + +> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. 
This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination. + +When we receive a pull request we'll review the changes and if everything's okay we'll update our copy. + +If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github). + +### How to write a site config file + +The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block. + +For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns). + +### Instapaper + +When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users. + +Marco, Instapaper's creator, graciously opened up the database of contributions to everyone: + +> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached. + +Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold). + +### Testing site config files + +Currently you will have to have a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier. 
diff --git a/data/GrabberConfig/aachener-nachrichten.de.txt b/data/GrabberConfig/aachener-nachrichten.de.txt new file mode 100644 index 00000000..b60c15de --- /dev/null +++ b/data/GrabberConfig/aachener-nachrichten.de.txt @@ -0,0 +1,10 @@ +title: //meta[@property='og:title']/@content +body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] + +strip_id_or_class: socialshareprivacy1 +strip_id_or_class: zvaFacebookButton + +tidy: no +prune: no + +test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757
\ No newline at end of file diff --git a/data/GrabberConfig/abc-luxe.com.txt b/data/GrabberConfig/abc-luxe.com.txt new file mode 100644 index 00000000..7c8f859f --- /dev/null +++ b/data/GrabberConfig/abc-luxe.com.txt @@ -0,0 +1,4 @@ +title: //div[contains(concat(' ',normalize-space(@class),' '),' brandMarginT ')]//h1 +body: //div[contains(concat(' ',normalize-space(@class),' '),' article ')] + +test_url: http://www.abc-luxe.com/actus/produits/article/kenzo-world-une-campagne-dejantee-pour-le-premier-parfum-signe-carol-lim-et-humberto-leon diff --git a/data/GrabberConfig/abc.es.txt b/data/GrabberConfig/abc.es.txt new file mode 100644 index 00000000..43aadc49 --- /dev/null +++ b/data/GrabberConfig/abc.es.txt @@ -0,0 +1,7 @@ +title: //meta[@property='og:title']/@content +body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody'] +strip_id_or_class: colB + +prune: no + +test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html
\ No newline at end of file diff --git a/data/GrabberConfig/abc.net.au.txt b/data/GrabberConfig/abc.net.au.txt new file mode 100644 index 00000000..22b3a0f4 --- /dev/null +++ b/data/GrabberConfig/abc.net.au.txt @@ -0,0 +1,18 @@ +title: //div[@class='article section']//h1 +author: //div[@class="byline"]/a +date: //span[@class="timestamp"] +body: //div[@class="page section"] + +strip: //a[@class="inline-caption"] +strip: //p[@class="ticker section noprint"] +strip: //p[@class="topics"] +strip: //h1 +strip: //div[@class="byline"] +strip: //p[@class="published"] +strip: //div[contains(@class,"featured-scroller")] +strip_id_or_class: footer + +tidy: no + +test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892 +test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business diff --git a/data/GrabberConfig/abcnews.go.com.txt b/data/GrabberConfig/abcnews.go.com.txt new file mode 100644 index 00000000..8d367351 --- /dev/null +++ b/data/GrabberConfig/abcnews.go.com.txt @@ -0,0 +1,27 @@ +title: //h1[@class='headline'] +body: //div[@id='storyText'] +# for video entries +body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')] +author: //div[@class='byline'] +date: //div[@class='date'] +strip: //*[@id='date_partner'] + +strip: //div[@class='breadcrumb'] +strip: //div[contains(@class,'show_tools')] +strip: //div[@id='sponsoredByAd'] +strip: //div[contains(@class,'rel_container')] +strip: //p[a[starts-with(@href, 'http://www.twitter.com')]] +strip: //p[a[starts-with(@href, 'http://www.facebook.com')]] +strip: //p[contains(., 'Click here to return to')] +#strip_id_or_class: media +strip_id_or_class: mediaplayer + +replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http + +prune: no + +single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true') + +test_url: 
http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744 +# multi-page +test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544
\ No newline at end of file diff --git a/data/GrabberConfig/accesstoinsight.org.txt b/data/GrabberConfig/accesstoinsight.org.txt new file mode 100644 index 00000000..45d66533 --- /dev/null +++ b/data/GrabberConfig/accesstoinsight.org.txt @@ -0,0 +1,9 @@ +title: //div[@id='H_docTitle'] + +body: //div[@id='H_meta' or @id='H_content' or @id='F_footer'] + +strip_id_or_class: F_toenail + +prune: no + +test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html
\ No newline at end of file diff --git a/data/GrabberConfig/acidcow.com.txt b/data/GrabberConfig/acidcow.com.txt new file mode 100644 index 00000000..21958651 --- /dev/null +++ b/data/GrabberConfig/acidcow.com.txt @@ -0,0 +1,3 @@ +body: //div[starts-with(@id, 'news-id-')] + +test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html
\ No newline at end of file diff --git a/data/GrabberConfig/aclu.org.txt b/data/GrabberConfig/aclu.org.txt new file mode 100644 index 00000000..74236e2d --- /dev/null +++ b/data/GrabberConfig/aclu.org.txt @@ -0,0 +1,5 @@ +body: //div[@class='panel-panel panel-main-3 content-column'] +title: //div[@class='panel-pane pane-node-title'] +date: //div[@class='updated-date'] + +test_url: https://www.aclu.org/blog/free-future/chinas-nightmarish-citizen-scores-are-warning-americans diff --git a/data/GrabberConfig/acroswing.fr.txt b/data/GrabberConfig/acroswing.fr.txt new file mode 100644 index 00000000..6b1d67fe --- /dev/null +++ b/data/GrabberConfig/acroswing.fr.txt @@ -0,0 +1,5 @@ +tidy:no +date: //time[@class='updated'] +dissolve: //ul[@class='video-gallery']/li +dissolve: //ul[@class='video-gallery'] +test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php
\ No newline at end of file diff --git a/data/GrabberConfig/actualitte.com.txt b/data/GrabberConfig/actualitte.com.txt new file mode 100644 index 00000000..0dba7a6e --- /dev/null +++ b/data/GrabberConfig/actualitte.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' article ')] +author: //p[@class="auteur"]/a + +test_url: https://www.actualitte.com/article/monde-edition/jean-miguel-pire-n-est-plus-le-conseiller-culture-livre-et-lecture-de-nyssen/84556 diff --git a/data/GrabberConfig/ad.nl.txt b/data/GrabberConfig/ad.nl.txt new file mode 100644 index 00000000..41ca08cf --- /dev/null +++ b/data/GrabberConfig/ad.nl.txt @@ -0,0 +1,7 @@ +#bypass cookie check +single_page_link: //a[contains(@href, '/accept?url=')] + +test_url: http://www.ad.nl/ad/nl/10444/Offside/article/detail/4043834/2015/05/31/Dani-Alves-voetbalt-met-drol-op-zijn-hoofd.dhtml +test_contains: De nieuwe coupe van Alves + +test_url: http://www.ad.nl/digitaal/rss.xml diff --git a/data/GrabberConfig/adme.ru.txt b/data/GrabberConfig/adme.ru.txt new file mode 100644 index 00000000..b929685d --- /dev/null +++ b/data/GrabberConfig/adme.ru.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.adme.ru%2Ftvorchestvo-hudozhniki%2Fprostoj-kak-5-kopeek-hudozhnik-557405%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')] +test_url: http://www.adme.ru/tvorchestvo-hudozhniki/prostoj-kak-5-kopeek-hudozhnik-557405/ diff --git a/data/GrabberConfig/admin-magazin.de.txt b/data/GrabberConfig/admin-magazin.de.txt new file mode 100644 index 00000000..c16207eb --- /dev/null +++ b/data/GrabberConfig/admin-magazin.de.txt @@ -0,0 +1,60 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: 
http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.admin-magazin.de%2FDas-Heft%2F2014%2F02%2FWorkshop-OpenLDAP-Server + +tidy: no +prune: no + +#################################################### +# ADMIN Das-Heft +#################################################### + +# Set article informations +title: //h1 +author: //div[contains(concat(' ',normalize-space(@class),' '),' author ')] + +# Content +body: //div[@id='ContentLeft']/div[contains(@class, 'full-article_print')] + + +# Fetch full multipage articles +next_page_link: //div[@class="pagenavigator"][1]/p/span[@class="next"]/a + +# Cleanup +strip_id_or_class: attribute-relatedcontent +strip_id_or_class: heftreferenz +strip_id_or_class: disqus_thread +strip_id_or_class: pagenavigator + +test_url: http://www.admin-magazin.de/Das-Heft/2014/02/Workshop-OpenLDAP-Server +test_contains: OpenLDAP hat in der Version 2.4 +test_url: http://www.admin-magazin.de/Das-Heft/2017/07/Sophos-Mobile-7 +test_url: http://www.admin-magazin.de/Das-Heft/2013/11/Hochverfuegbarkeit-mit-SQL-Server-2012-und-2014 +test_contains: Lokale Hochverfügbarkeit kommt in diesem Fall durch redundante Server-Instanzen zustande +test_url: http://www.admin-magazin.de/Online-Artikel/Postfix-einrichten-und-absichern +test_contains: Die eben beschriebenen Maßnahmen reduzieren zwar den Ansturm von unerwünschter Mail + +################################################### +# ADMIN News +################################################### + +# Set article informations +title: //h1 +date: //div[contains(@class, 'view-publish_date')] + +# Content +body: //div[@id='ContentLeft']/div[contains(@class, 'full-news')] + +# Cleanup +strip_id_or_class: adlib_wrapper +strip_id_or_class: addthis_toolbox +strip_id_or_class: tags +strip_id_or_class: attribute-relatedcontent +strip_id_or_class: disqus_thread + +test_url: http://www.admin-magazin.de/News/Programm-fuer-LinuxCon-Europe-steht/(tagID)/13 +test_contains: Die Linux Foundation hat +test_url: 
http://www.admin-magazin.de/News/Release-Kandidat-von-HAProxy-1.8-unterstuetzt-HTTP-2 +test_contains: Mit Version 1.8 unterstützt HAProxy netzwerkseitig +test_url: http://www.admin-magazin.de/Online-Artikel/Docker-Workshop-2-Volumes +test_contains: Wenn ein Verzeichnis, das Sie auf dem Host als Docker-Volume nutzen wollen diff --git a/data/GrabberConfig/adslzone.net.txt b/data/GrabberConfig/adslzone.net.txt new file mode 100644 index 00000000..00d69de0 --- /dev/null +++ b/data/GrabberConfig/adslzone.net.txt @@ -0,0 +1,8 @@ +title: //h1[@class='entry-title'] +body: //article + +strip: //div[@class='io-word-count'] +strip: //ol[@class='breadcrumb'] +strip: //aside + +test_url: https://www.adslzone.net/2017/12/19/xiaomi-sneaky-santa-17/ diff --git a/data/GrabberConfig/aei.org.txt b/data/GrabberConfig/aei.org.txt new file mode 100644 index 00000000..7cea6cae --- /dev/null +++ b/data/GrabberConfig/aei.org.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.aei.org%2Fspotlight%2Fthe-bell-curve-explained%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' textdiv ')] +test_url: https://www.aei.org/spotlight/the-bell-curve-explained/
\ No newline at end of file diff --git a/data/GrabberConfig/africaintelligence.fr.txt b/data/GrabberConfig/africaintelligence.fr.txt new file mode 100644 index 00000000..68e21776 --- /dev/null +++ b/data/GrabberConfig/africaintelligence.fr.txt @@ -0,0 +1,26 @@ + +# Any modifications done here should be duplicated in +# - lalettrea.fr.txt +# - intelligenceonline.fr.txt +# as they seem to use the exact same CMS software as africaintelligence.fr + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-chapo ')] | //div[contains(concat(' ',normalize-space(@class),' '),' article-body ')] + +prune: no + +strip_id_or_class: sidenav +strip_id_or_class: sidenav-content +strip_id_or_class: article-copyright + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //form[contains(concat(' ',normalize-space(@class),' '),' form-login ')] +login_uri: https://www.africaintelligence.fr/ajax/login/login +login_username_field: Identifiant +login_password_field: MotDePasse +login_extra_fields: ConnexionAuto=on +login_extra_fields: OrigineLogin=Landing + +test_url: https://www.africaintelligence.fr/mc-/reseaux-d-affaires/2018/07/19/la-lia-recupere-ses-tresors-parisiens-des-mains-d-al-kharafi,108317644-gra diff --git a/data/GrabberConfig/aftenposten.no.txt b/data/GrabberConfig/aftenposten.no.txt new file mode 100644 index 00000000..8a69c357 --- /dev/null +++ b/data/GrabberConfig/aftenposten.no.txt @@ -0,0 +1,5 @@ +title: //h1[@class='articleTitle '] +body: //div[@class='bodyText widget storyContent'] +strip: //p/span[@class='quote']/.. +strip_id_or_class: 'pull1' +test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html
\ No newline at end of file diff --git a/data/GrabberConfig/aftonbladet.se.txt b/data/GrabberConfig/aftonbladet.se.txt new file mode 100644 index 00000000..b6c576a8 --- /dev/null +++ b/data/GrabberConfig/aftonbladet.se.txt @@ -0,0 +1,13 @@ +author: //article//address[contains(@class, 'author')] +body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')] + +strip: //address//img +strip: //footer +strip_id_or_class: abSticky + +prune: no + +test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab +test_url: http://www.aftonbladet.se/debatt/article16207536.ab +test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab +test_url: http://www.aftonbladet.se/rss.xml
\ No newline at end of file diff --git a/data/GrabberConfig/aht.seriouseats.com.txt b/data/GrabberConfig/aht.seriouseats.com.txt new file mode 100644 index 00000000..b2d88a05 --- /dev/null +++ b/data/GrabberConfig/aht.seriouseats.com.txt @@ -0,0 +1,15 @@ +body: //div[@id='content'] + +# clean up recipe pages +strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] + +#recipe pages +strip_id_or_class: "recipe-feedback" +strip_id_or_class: "comments" +strip_id_or_class: "procedure-number" +strip_id_or_class: "more-with-author" + +#slice +strip_id_or_class: "inner" + +test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html
\ No newline at end of file diff --git a/data/GrabberConfig/aitnews.com.txt b/data/GrabberConfig/aitnews.com.txt new file mode 100644 index 00000000..1b944053 --- /dev/null +++ b/data/GrabberConfig/aitnews.com.txt @@ -0,0 +1,3 @@ +body: //div[contains(@class, 'single-post-thumbnail')] | //*[@itemprop="articleBody"] + +test_url: http://aitnews.com/2016/04/23/%D8%A7%D9%84%D9%80-fbi-%D9%8A%D8%AE%D8%AA%D8%B1%D9%82-%D9%87%D8%A7%D8%AA%D9%81-%D8%A2%D9%8A%D9%81%D9%88%D9%86-%D8%A2%D8%AE%D8%B1-%D8%AF%D9%88%D9%86-%D9%85%D8%B3%D8%A7%D8%B9%D8%AF%D8%A9-%D8%A2%D8%A8/ diff --git a/data/GrabberConfig/albayan.ae.txt b/data/GrabberConfig/albayan.ae.txt new file mode 100644 index 00000000..d52700b3 --- /dev/null +++ b/data/GrabberConfig/albayan.ae.txt @@ -0,0 +1,8 @@ +body: //div[@id='main-column']//div[@class='content'] + +strip_id_or_class: social-buttons + +prune: no + +test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645 +test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout
\ No newline at end of file diff --git a/data/GrabberConfig/alex.mullr.net.txt b/data/GrabberConfig/alex.mullr.net.txt new file mode 100644 index 00000000..c5f15370 --- /dev/null +++ b/data/GrabberConfig/alex.mullr.net.txt @@ -0,0 +1,2 @@ +body: //div[@class="entry"] +test_url: http://alex.mullr.net/blog/2011/05/on-spotify/
\ No newline at end of file diff --git a/data/GrabberConfig/alexduner.com.txt b/data/GrabberConfig/alexduner.com.txt new file mode 100644 index 00000000..3897f9ec --- /dev/null +++ b/data/GrabberConfig/alexduner.com.txt @@ -0,0 +1,4 @@ +body: //section[@class='content'] +date: //span[1] +author: //h1[@id='sitetitle'] +test_url: http://alexduner.com/blog/something-i-learned-today diff --git a/data/GrabberConfig/alimentation-generale.fr.txt b/data/GrabberConfig/alimentation-generale.fr.txt new file mode 100644 index 00000000..d258d0f9 --- /dev/null +++ b/data/GrabberConfig/alimentation-generale.fr.txt @@ -0,0 +1,12 @@ + +body: //div[@id='arty'] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' entete ')]//a[starts-with(@href,'https://alimentation-generale.fr/author/')] + +strip_id_or_class: entete +strip_id_or_class: singlenav +strip_id_or_class: blokpartage +strip_id_or_class: encontinuwrapper +strip_id_or_class: blokarticles_lies + +test_url: https://alimentation-generale.fr/reportage/la-ferme-du-bonheur-une-oasis-au-milieu-du-bitume/ diff --git a/data/GrabberConfig/alistapart.com.txt b/data/GrabberConfig/alistapart.com.txt new file mode 100644 index 00000000..bbc09945 --- /dev/null +++ b/data/GrabberConfig/alistapart.com.txt @@ -0,0 +1,12 @@ +title: //h1[@class='entry-title'] +author: //h2/a[@class='fn'] +date: //time[@itemprop='datePublished'] + +body: //div[@itemprop='articleBody'] +strip: //aside +strip: //div[@class="utility-side-bar"] +strip_id_or_class: 'aside-breaker' + +prune: no +test_url: http://www.alistapart.com/articles/organizing-mobile/ +test_url: https://alistapart.com/article/tags-to-dom diff --git a/data/GrabberConfig/aljazeera.com.txt b/data/GrabberConfig/aljazeera.com.txt new file mode 100644 index 00000000..d3bf4014 --- /dev/null +++ b/data/GrabberConfig/aljazeera.com.txt @@ -0,0 +1,8 @@ +title: //span[@id='DetailedTitle'] +body: //td[@id='tdTextContent'] +strip_id_or_class: Skyscrapper_Body +date: 
//span[@id='ctl00_cphBody_lblDate'] +author: //div[@id="dvAuthorInfo"]//a/text() +strip: //table[ tbody/tr/td/object ] +prune: no +test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html
\ No newline at end of file diff --git a/data/GrabberConfig/allafrica.com.txt b/data/GrabberConfig/allafrica.com.txt new file mode 100644 index 00000000..ca20666b --- /dev/null +++ b/data/GrabberConfig/allafrica.com.txt @@ -0,0 +1,3 @@ +http_header(user-agent): PHP/7.0 + +test_url: http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf diff --git a/data/GrabberConfig/allrecipes.com.txt b/data/GrabberConfig/allrecipes.com.txt new file mode 100644 index 00000000..85dc2a5a --- /dev/null +++ b/data/GrabberConfig/allrecipes.com.txt @@ -0,0 +1,14 @@ +title: //h1[@id='itemTitle'] +body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')] +strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right'] +strip: //div[contains(@class, 'rightcoltoolsdiv')] +strip: //div[contains(@class, 'servings-form')] +strip: //p[@class='nutritional-information'] +strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')] +strip: //div[@id='nutri-info']/div[contains(@class, 'title')] +strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter'] +strip_id_or_class: eshaAttribute +strip_id_or_class: eshaParagraph +prune: no + +test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd
\ No newline at end of file diff --git a/data/GrabberConfig/allthingsd.com.txt b/data/GrabberConfig/allthingsd.com.txt new file mode 100644 index 00000000..f8c67d02 --- /dev/null +++ b/data/GrabberConfig/allthingsd.com.txt @@ -0,0 +1,13 @@ +title://div[@class="article-title"]/h1[@class="title"] +date: //p[@class="article-date"] +body://div[contains(@class, "article-body")] +# Trim out related posts at bottom of article +strip://blockquote[@class="memo"] + +tidy: no + +# Yup, no idea why author won't work... +author://div[@class="page-header article-header clearfix"]/p[@class="title"] +# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it. +test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/ +test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/
\ No newline at end of file diff --git a/data/GrabberConfig/allyou.com.txt b/data/GrabberConfig/allyou.com.txt new file mode 100644 index 00000000..a13a7252 --- /dev/null +++ b/data/GrabberConfig/allyou.com.txt @@ -0,0 +1,8 @@ +title: //div[@id='pageHdr']//h1 +body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint'] +strip: //div[contains(@class, 'infoBox') or @id='infoBox'] +single_page_link: //li[@id='print']/a + +prune: no + +test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/
\ No newline at end of file diff --git a/data/GrabberConfig/alphabeta.argaam.com.txt b/data/GrabberConfig/alphabeta.argaam.com.txt new file mode 100644 index 00000000..da1a67bc --- /dev/null +++ b/data/GrabberConfig/alphabeta.argaam.com.txt @@ -0,0 +1,11 @@ +body: //div[@class = 'entry'] +date: substring-after(//p[@class="date"],'بتاريخ ') +strip_id_or_class: date +strip_id_or_class: follow-single +strip_id_or_class: ratingblock +strip_id_or_class: newRatingHolder +strip_id_or_class: postmetadata +strip_id_or_class: addthis_toolbox +strip_id_or_class: addthis_default_style +strip_id_or_class: size-full +test_url: http://alphabeta.argaam.com/?p=35657
\ No newline at end of file diff --git a/data/GrabberConfig/alriyadh.com.txt b/data/GrabberConfig/alriyadh.com.txt new file mode 100644 index 00000000..be7c43d5 --- /dev/null +++ b/data/GrabberConfig/alriyadh.com.txt @@ -0,0 +1,9 @@ +body: //div[@id = "article-view"] +body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')] +author: //p[@class = "author"] +strip: //h1 +strip: //h2 +strip_id_or_class: author +prune: no +test_url: http://www.alriyadh.com/2011/10/10/article674357.html +test_url: http://www.alriyadh.com/net/article/780935
\ No newline at end of file diff --git a/data/GrabberConfig/alsacreations.com.txt b/data/GrabberConfig/alsacreations.com.txt new file mode 100644 index 00000000..300ca00a --- /dev/null +++ b/data/GrabberConfig/alsacreations.com.txt @@ -0,0 +1,4 @@ +author: //div[@class="auteur-meta"]//a[contains(@href, '/profil/')] +date: //div[@class="auteur-meta"]//time/@datetime + +test_url: https://www.alsacreations.com/tuto/lire/1771-css-grid-layout-en-production.html diff --git a/data/GrabberConfig/alseraj.net.txt b/data/GrabberConfig/alseraj.net.txt new file mode 100644 index 00000000..107d82d6 --- /dev/null +++ b/data/GrabberConfig/alseraj.net.txt @@ -0,0 +1,2 @@ +title: //*[@id='normalfontyellow'] +test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2
\ No newline at end of file diff --git a/data/GrabberConfig/alternatives-economiques.fr.txt b/data/GrabberConfig/alternatives-economiques.fr.txt new file mode 100644 index 00000000..de00e7d0 --- /dev/null +++ b/data/GrabberConfig/alternatives-economiques.fr.txt @@ -0,0 +1,20 @@ +# Publication date +date: //header[contains(concat(' ',normalize-space(@class),' '),' o-page__content__head ')]//div[contains(concat(' ',normalize-space(@class),' '),' o-infos ')]//time[contains(concat(' ',normalize-space(@class),' '),' o-infos__date ')] + +# Author +author: //header[contains(concat(' ',normalize-space(@class),' '),' o-page__content__head ')]//div[contains(concat(' ',normalize-space(@class),' '),' o-infos ')]//a[contains(concat(' ',normalize-space(@class),' '),' o-infos__author ')] + +# Remove quotes and others +strip: //div[contains(concat(' ',normalize-space(@class),' '),' o-page__footer ')] +strip_id_or_class: c-same-subject +strip_id_or_class: c-kiosk--single +strip_id_or_class: o-page__footer +strip_id_or_class: o-page__figure__trigger +strip_id_or_class: c-comments +strip_id_or_class: c-epigraph + +# Strip optional [removed because clearer like this +#strip_id_or_class: o-page__content__head + +# Test URL +test_url: https://www.alternatives-economiques.fr/etats-unis-overdoses-tuent-plus-armes-a-feu/00085167 diff --git a/data/GrabberConfig/alternet.org.txt b/data/GrabberConfig/alternet.org.txt new file mode 100644 index 00000000..e92252eb --- /dev/null +++ b/data/GrabberConfig/alternet.org.txt @@ -0,0 +1,4 @@ +single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')] + +test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest +test_url: http://feeds.feedblitz.com/alternet
\ No newline at end of file diff --git a/data/GrabberConfig/altfoto.com.txt b/data/GrabberConfig/altfoto.com.txt new file mode 100644 index 00000000..d974cf4a --- /dev/null +++ b/data/GrabberConfig/altfoto.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras
\ No newline at end of file diff --git a/data/GrabberConfig/alumni.stanford.edu.txt b/data/GrabberConfig/alumni.stanford.edu.txt new file mode 100644 index 00000000..a5bd03bf --- /dev/null +++ b/data/GrabberConfig/alumni.stanford.edu.txt @@ -0,0 +1,10 @@ +title: //h1 + +author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ") + +date: //div/a[contains (@href, "issue")] + +move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1] + +body: //div[@class="enableBullets"] +test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819
\ No newline at end of file diff --git a/data/GrabberConfig/amandala.com.bz.txt b/data/GrabberConfig/amandala.com.bz.txt new file mode 100644 index 00000000..fb0e21b8 --- /dev/null +++ b/data/GrabberConfig/amandala.com.bz.txt @@ -0,0 +1,6 @@ +body: //div[@id='content']//div[contains(@class, 'content')] +strip_id_or_class: widget +strip: //a[contains(@href, 'upm_export=')] + +test_url: http://amandala.com.bz/news/feed/ +test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/
\ No newline at end of file diff --git a/data/GrabberConfig/amazon.com.txt b/data/GrabberConfig/amazon.com.txt new file mode 100644 index 00000000..c1d8d156 --- /dev/null +++ b/data/GrabberConfig/amazon.com.txt @@ -0,0 +1,21 @@ +title: //span[@id = 'btAsinTitle'] +body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div +#strip_id_or_class: quantityDropdownDiv +#strip_id_or_class: addToCartSpan +#strip_id_or_class: oneClickDiv +strip_id_or_class: nocontent +strip_id_or_class: masDynamicConten +strip_id_or_class: dynamic-content +prune: no + +find_string: <span id="actualPriceValue"> +replace_string: <span id="actualPriceValue"><br />Price: + +strip_id_or_class: collapsePS +strip_id_or_class: expandPS +strip_id_or_class: psPlaceHolde +strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')] + +http_header(user-agent): PHP/5.3 + +test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/ diff --git a/data/GrabberConfig/americandrink.net.txt b/data/GrabberConfig/americandrink.net.txt new file mode 100644 index 00000000..7145f3ff --- /dev/null +++ b/data/GrabberConfig/americandrink.net.txt @@ -0,0 +1,6 @@ +title: //div[@class='head']/h2/a +author: //div[@class='head']/a +date: //div[@class='head']/p[@class='date']/a +body: //div[@class='copy'] +strip: //p[@class='meta'] +test_url: http://americandrink.net/post/10567188712/free-the-hooch
\ No newline at end of file diff --git a/data/GrabberConfig/americastestkitchenfeed.com.txt b/data/GrabberConfig/americastestkitchenfeed.com.txt new file mode 100644 index 00000000..c2b62b5a --- /dev/null +++ b/data/GrabberConfig/americastestkitchenfeed.com.txt @@ -0,0 +1,5 @@ +title: //h1[@class="post-title"] +author: //span[@class="author"]/a +date: //span[@class="date"] +body: //div[@class="post-content main"] +test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/
\ No newline at end of file diff --git a/data/GrabberConfig/amptoons.com.txt b/data/GrabberConfig/amptoons.com.txt new file mode 100644 index 00000000..d3fa8fa1 --- /dev/null +++ b/data/GrabberConfig/amptoons.com.txt @@ -0,0 +1,8 @@ +title: //title + +body: //div[@class="entry-content"] + +author: //span[@class="author vcard"] + +date: //span[@class="entry-date"] +test_url: http://amptoons.com/blog/?p=22736 diff --git a/data/GrabberConfig/anandtech.com.txt b/data/GrabberConfig/anandtech.com.txt new file mode 100644 index 00000000..faba9fb8 --- /dev/null +++ b/data/GrabberConfig/anandtech.com.txt @@ -0,0 +1,16 @@ +body: //section[@class='main_cont']/img | //div[@class='articleContent'] +title: //div[@class='blog_top_left']//h2 +author: //a[@class='b'][1] +date: substring-after(substring-before(//div, 'Posted in'), ' on ') +strip_image_src: /content/images/globals/ +strip: //h2[. = 'Page 1']/preceding::p +strip: //h2 + +prune: no + +single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) +next_page_link: //div[@class='article_links']/a[@class='fl-rt'] + +test_url: http://www.anandtech.com/show/8370/gigabyte-am1m-s2h-review +test_url: http://www.anandtech.com/show/8402/sandisk-releases-ultra-ii-ssd-the-second-tlc-nand-ssd-in-the-market +test_url: http://www.anandtech.com/show/8400/arms-cortex-m-even-smaller-and-lower-power-cpu-cores diff --git a/data/GrabberConfig/android-developers.googleblog.com.txt b/data/GrabberConfig/android-developers.googleblog.com.txt new file mode 100644 index 00000000..c740a49c --- /dev/null +++ b/data/GrabberConfig/android-developers.googleblog.com.txt @@ -0,0 +1,8 @@ +body: //div[@itemprop="articleBody"] + +find_string: <noscript> +replace_string: <div> +find_string: </noscript> +replace_string: </div> + +test_url: https://android-developers.googleblog.com/2017/08/introducing-android-8-oreo.html
\ No newline at end of file diff --git a/data/GrabberConfig/androidandme.com.txt b/data/GrabberConfig/androidandme.com.txt new file mode 100644 index 00000000..bb48a7cb --- /dev/null +++ b/data/GrabberConfig/androidandme.com.txt @@ -0,0 +1,5 @@ +body: //img[@class='attachment-large wp-post-image'] | //div[@class='entry-content'] +prune: no + +test_url: http://androidandme.com/2015/12/news/google-introduces-trial-run-ads-and-interactive-interstitials/ +test_url: http://androidandme.com/2015/12/applications/top-10-new-android-games-this-week-maestria-mad-aces/
\ No newline at end of file diff --git a/data/GrabberConfig/androidpolice.com.txt b/data/GrabberConfig/androidpolice.com.txt new file mode 100644 index 00000000..3588e588 --- /dev/null +++ b/data/GrabberConfig/androidpolice.com.txt @@ -0,0 +1,10 @@ +body: //div[@class='post_content'] +date: //div[@class='date_day'] | //div[@class='date_month'] +strip_id_or_class: author-box +strip_id_or_class: multi-page-post +strip_id_or_class: toc_container +author: //h2[@class='author-box-heading']/a +next_page_link: //link[@rel='next']/@href + +test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/ +test_url: http://www.androidpolice.com/2015/12/07/32-new-and-notable-android-games-from-the-last-2-weeks-112415-12715/
\ No newline at end of file diff --git a/data/GrabberConfig/annatravelling.wordpress.com.txt b/data/GrabberConfig/annatravelling.wordpress.com.txt new file mode 100644 index 00000000..2d8937f7 --- /dev/null +++ b/data/GrabberConfig/annatravelling.wordpress.com.txt @@ -0,0 +1,9 @@ +title: //h1[@class="title"] + +author: ("Anna Manasova") +# is ignored, unfortunately + +date: //p[@class="date"] + +body: //div[@class="entry"] +test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/
\ No newline at end of file diff --git a/data/GrabberConfig/annouchka.fr.txt b/data/GrabberConfig/annouchka.fr.txt new file mode 100644 index 00000000..c88848ca --- /dev/null +++ b/data/GrabberConfig/annouchka.fr.txt @@ -0,0 +1,8 @@ +strip_id_or_class: code-block +strip_id_or_class: hellocoton_vote + +strip: //a[@class='c3'] +strip: //a[@class='c4'] +strip: //a[@class='c5'] + +test_url: http://www.annouchka.fr/5-conseils-pour-profiter-un-peu-plus-de-ses-enfants/ diff --git a/data/GrabberConfig/antirez.com.txt b/data/GrabberConfig/antirez.com.txt new file mode 100644 index 00000000..f8bef02c --- /dev/null +++ b/data/GrabberConfig/antirez.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fantirez.com%2Fnews%2F104 + +body: //article/pre +test_url: http://antirez.com/news/104 diff --git a/data/GrabberConfig/apotheke-adhoc.de.txt b/data/GrabberConfig/apotheke-adhoc.de.txt new file mode 100644 index 00000000..e1736620 --- /dev/null +++ b/data/GrabberConfig/apotheke-adhoc.de.txt @@ -0,0 +1,23 @@ +# Author: zinnober + +prune: no + +title: substring-before(//div[@id='content']/h1, ',') + +single_page_link: //a[@title='Seite drucken'] + +body: //div[@id='detail-body'] + +replace_string(<span class="description">): <em> +replace_string(<p class="leadtext"><small>): <p class="leadtext"> + +# Fix headlines +replace_string(Patrick Hollstein): +replace_string(APOTHEKE ADHOC): +replace_string(dpa): +replace_string(Katharina Lübke): +replace_string(Julia Pradel): +replace_string(Franziska Gerhardt): + +test_url: https://www.apotheke-adhoc.de/nachrichten/detail/panorama/avie-fuer-den-nachfolger/ + diff --git a/data/GrabberConfig/apple.news.txt b/data/GrabberConfig/apple.news.txt new file mode 100644 index 00000000..7cb13d81 --- /dev/null +++ b/data/GrabberConfig/apple.news.txt @@ -0,0 +1,2 @@ +single_page_link: 
//p//a[contains(., 'Click here')] +test_url: https://apple.news/AHQREjzH0Ts6iikKhNe6o8w diff --git a/data/GrabberConfig/appleinsider.com.txt b/data/GrabberConfig/appleinsider.com.txt new file mode 100644 index 00000000..5ae1050b --- /dev/null +++ b/data/GrabberConfig/appleinsider.com.txt @@ -0,0 +1,23 @@ +title: //h1[@class="art-head"] + +author: //p[contains(@class, 'byline')]/a +#author: //p[text() = 'By ']/a/text() +#strip: //p[text() = 'By '] + +date: //p[contains(@class, 'date-header')] + +body: //div[@class="article"] +strip_id_or_class: lazy +#strip_id_or_class: minor +strip_id_or_class: multipagefooter +strip_id_or_class: date-header +strip_id_or_class: byline + +find_string: <noscript> +replace_string: <div> +find_string: </noscript> +replace_string: </div> + +test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html +test_url: http://appleinsider.com/articles/13/10/03/goldee-companion-app-for-philips-hue-bulbs-offers-shifting-dynamic-light-scenes +test_url: http://appleinsider.com/appleinsider.rss
\ No newline at end of file diff --git a/data/GrabberConfig/appleweblog.com.txt b/data/GrabberConfig/appleweblog.com.txt new file mode 100644 index 00000000..023c9ccb --- /dev/null +++ b/data/GrabberConfig/appleweblog.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://appleweblog.com/2011/09/encontrada-vulnerabilidad-grave-en-skype-para-ios
\ No newline at end of file diff --git a/data/GrabberConfig/archdaily.com.txt b/data/GrabberConfig/archdaily.com.txt new file mode 100644 index 00000000..0178639e --- /dev/null +++ b/data/GrabberConfig/archdaily.com.txt @@ -0,0 +1,5 @@ +date: //div[@class='post_date'] + +body: //div[@class='post_content'] + +test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up
\ No newline at end of file diff --git a/data/GrabberConfig/archive.pressthink.org.txt b/data/GrabberConfig/archive.pressthink.org.txt new file mode 100644 index 00000000..ab973a39 --- /dev/null +++ b/data/GrabberConfig/archive.pressthink.org.txt @@ -0,0 +1,11 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder + +title: //h3[contains(concat(' ',normalize-space(@class),' '),' title ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' blogbody ')] +date: //div[contains(concat(' ',normalize-space(@class),' '),' date ')] + +strip: //h3[contains(concat(' ',normalize-space(@class),' '),' title ')] +strip: //span[contains(concat(' ',normalize-space(@class),' '),' posted ')] + +test_url: http://archive.pressthink.org/2003/09/08/basics_master.html diff --git a/data/GrabberConfig/archiveofourown.org.txt b/data/GrabberConfig/archiveofourown.org.txt new file mode 100644 index 00000000..8ddc11cd --- /dev/null +++ b/data/GrabberConfig/archiveofourown.org.txt @@ -0,0 +1,26 @@ +# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages. +# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default. +# Exclude: header, footer, navigation, comments. +# Notes: User is a newbie with XPaths. 
+ +body: //div[@id="workskin"] + +prune: no + +title: //h2[@class='title'] +author: //h3[@class='byline'] +author: //a[@class='login author'] + +strip_id_or_class:header +strip_id_or_class:navigation +strip_id_or_class:feedback +strip_id_or_class:kudos +strip_id_or_class:add_comment_placeholder +strip_id_or_class:add_comment +strip_id_or_class:globalize +strip_id_or_class:footer + +single_page_link: //div[@id='main']//a[contains(@href, 'view_adult=true')] + +test_url: https://archiveofourown.org/works/229402?view_full_work=true +test_url: https://archiveofourown.org/works/750111/chapters/1399929 diff --git a/data/GrabberConfig/arduino-tutorial.de.txt b/data/GrabberConfig/arduino-tutorial.de.txt new file mode 100644 index 00000000..ac0c7c05 --- /dev/null +++ b/data/GrabberConfig/arduino-tutorial.de.txt @@ -0,0 +1,4 @@ +# Strip the duplicate plain-text copy of code emitted by Crayon Syntax Highlighter +strip: //textarea[contains(@class, 'crayon-plain')] + +test_url: https://www.arduino-tutorial.de/ein-altes-handy-als-arduino-input/ diff --git a/data/GrabberConfig/arretsurimages.net.txt b/data/GrabberConfig/arretsurimages.net.txt new file mode 100644 index 00000000..3ba68881 --- /dev/null +++ b/data/GrabberConfig/arretsurimages.net.txt @@ -0,0 +1,14 @@ +title://div[@id="titrage-contenu"]/h1[@class="title"] +author: //div[@id="titrage-contenu"]//a[starts-with(@href,'/recherche.php?auteur_id=')] +body: //div[@class="contenu-html"]/div[@class="page-pane"] + +# Wallabag-specific login directives (not supported in FTR) +requires_login: yes + +login_uri: http://www.arretsurimages.net/forum/login.php +login_username_field: username +login_password_field: password + +not_logged_in_xpath: //body[@class="not-logged-in"] + +test_url: http://www.arretsurimages.net/chroniques/2017-03-04/Mathilde-Larrere-aux-vraies-origines-du-8-mars-id9619 diff --git a/data/GrabberConfig/arstechnica.co.uk.txt b/data/GrabberConfig/arstechnica.co.uk.txt new file mode 100644 index 00000000..9ec8ff69 --- /dev/null +++ 
b/data/GrabberConfig/arstechnica.co.uk.txt @@ -0,0 +1,6 @@ +strip: //aside +next_page_link: //nav//a[contains(text(), 'Next')]/@href + +test_url: http://arstechnica.co.uk/science/2016/06/what-is-open-access-free-sharing-of-all-human-knowledge/ +test_url: http://arstechnica.co.uk/information-technology/2016/05/eben-moglen-gpl-online-advertising-is-becoming-a-perfect-despotism/ + diff --git a/data/GrabberConfig/arstechnica.com.txt b/data/GrabberConfig/arstechnica.com.txt new file mode 100644 index 00000000..ffd5cc89 --- /dev/null +++ b/data/GrabberConfig/arstechnica.com.txt @@ -0,0 +1,20 @@ +author: //p[@class='byline']/a +body: //div[contains(@class,'article-content')] +strip: //h2[@class='title'] +strip_id_or_class: byline +strip_id_or_class: story-sidebar +prune: no + +date: //div[@class='byline']/span[@class='posted']//abbr/@original-title +date: //div[@class='byline']/span[@class='posted']//abbr + +title: //div[@id='story']//h2[@class='title'] + +strip: //div[@class='pager'] +native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')] + +strip: //aside +next_page_link: //nav//a[contains(text(), 'Next')]/@href + +test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars +test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ diff --git a/data/GrabberConfig/articles.courant.com.txt b/data/GrabberConfig/articles.courant.com.txt new file mode 100644 index 00000000..984d81de --- /dev/null +++ b/data/GrabberConfig/articles.courant.com.txt @@ -0,0 +1,11 @@ +title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1 +date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"] +author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3] + +strip_id_or_class: mod-article-byline +strip_id_or_class: mod-article-header +strip_id_or_class: mod-article-subtitle +#This leaves some crud after the article, but it's better than 
nothing. +#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element. + +test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown
\ No newline at end of file diff --git a/data/GrabberConfig/articles.washingtonpost.com.txt b/data/GrabberConfig/articles.washingtonpost.com.txt new file mode 100644 index 00000000..d9d33fc5 --- /dev/null +++ b/data/GrabberConfig/articles.washingtonpost.com.txt @@ -0,0 +1,14 @@ +body: //div[contains(@class, "article_body")] +# print view +body: //div[@id='print_facet']//div[@id='body'] + +#GDPR cookies +http_header(Cookie): wp_devicetype=0; rplpwabt4=1; devicetype=0; osfam=0; de=; client_region=0; wp_gdpr=1|1; rplmct=1; washpost_poe=true; + +tidy: no +prune: no + +single_page_link: concat(substring-before(//div[@id="echo_container_a"]/@guid, '_story.html'), '_print.html') + +test_url: http://articles.washingtonpost.com/2011-10-22/world/35279694_1_germany-acts-german-leaders-chancellor-angela-merkel +test_url: http://articles.washingtonpost.com/2013-05-31/opinions/39658000_1_chemical-weapons-mass-destruction-cartels diff --git a/data/GrabberConfig/artofmanliness.com.txt b/data/GrabberConfig/artofmanliness.com.txt new file mode 100644 index 00000000..c4a8e63e --- /dev/null +++ b/data/GrabberConfig/artofmanliness.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.artofmanliness.com%2F2015%2F12%2F07%2Fhow-to-turn-an-ordinary-routine-into-a-spirit-renewing-ritual%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-body ')] +test_url: http://www.artofmanliness.com/2015/06/29/youve-got-to-be-a-man-before-you-can-be-a-gentleman/ diff --git a/data/GrabberConfig/artsixmic.fr.txt b/data/GrabberConfig/artsixmic.fr.txt new file mode 100644 index 00000000..b8bd6728 --- /dev/null +++ b/data/GrabberConfig/artsixmic.fr.txt @@ -0,0 +1,11 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +# strip section about related articles and everything following: +strip: 
//strong[(text()='A voir aussi sur artsixMic :') or (text()='A lire aussi sur artsixMic :')]/ancestor::*/following-sibling::* +strip: //strong[(text()='A voir aussi sur artsixMic :') or (text()='A lire aussi sur artsixMic :')] + +test_url: https://www.artsixmic.fr/mais-qui-est-donc-marcelline-l-aubergine/ +test_url: https://www.artsixmic.fr/elia-pagliarino-sur-les-traces-des-tatouages-polynesiens/ + + diff --git a/data/GrabberConfig/ascarter.net.txt b/data/GrabberConfig/ascarter.net.txt new file mode 100644 index 00000000..0327e846 --- /dev/null +++ b/data/GrabberConfig/ascarter.net.txt @@ -0,0 +1,5 @@ +title: //h1[@class='article_title'] +author: //span[@class='author'] +date: //h2[@class='dateline'] +body: //div[@class='article_body'] +test_url: http://ascarter.net/2012/02/20/enough-is-enough.html
\ No newline at end of file diff --git a/data/GrabberConfig/astronews.com.txt b/data/GrabberConfig/astronews.com.txt new file mode 100644 index 00000000..8de22270 --- /dev/null +++ b/data/GrabberConfig/astronews.com.txt @@ -0,0 +1,7 @@ +title: //span[@class='titel'] +author: //span[@class='metadaten_C']/a//span[@class='metadaten_C'] +date: substring-after(//span[@class='metadaten_C'],'astronews.com') +strip: //span[@class='bu'] +strip_image_src: '/_images/' + +test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/asymco.com.txt b/data/GrabberConfig/asymco.com.txt new file mode 100644 index 00000000..f639b048 --- /dev/null +++ b/data/GrabberConfig/asymco.com.txt @@ -0,0 +1,8 @@ +# Johannes Stühler + +title://h2 +author://span[@class='meta-content'] +date://abbr[@class='date published']/@title +body://div[@class='entry-content'] + +test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/
\ No newline at end of file diff --git a/data/GrabberConfig/atlantico.fr.txt b/data/GrabberConfig/atlantico.fr.txt new file mode 100644 index 00000000..64ffb544 --- /dev/null +++ b/data/GrabberConfig/atlantico.fr.txt @@ -0,0 +1,12 @@ +body: //div[@id='content'] + +strip_id_or_class: smart-paging-pager +strip_id_or_class: content-bot +strip_id_or_class: article_tools_src +strip_id_or_class: content-side + +next_page_link: //li[@class='pager-next']/a + +date: //div[@class='metas']/time/@datetime + +test_url: http://www.atlantico.fr/decryptage/running-qu-faut-manger-avant-pendant-et-apres-course-pied-jean-daniel-lalau-3153154.html diff --git a/data/GrabberConfig/au.news.yahoo.com.txt b/data/GrabberConfig/au.news.yahoo.com.txt new file mode 100644 index 00000000..65753c4c --- /dev/null +++ b/data/GrabberConfig/au.news.yahoo.com.txt @@ -0,0 +1,5 @@ +strip: //a[contains(text(), "RELATED:")] +author: //div[@class="info"]//span[@class="association printer-source"] +date: //div[@class="info"]//span[@class="stamp printer-date"] + +test_url: https://au.news.yahoo.com/a/31334394/brave-subway-employee-fights-off-masked-robber-using-her-bare-hands/ diff --git a/data/GrabberConfig/autoactu.com.txt b/data/GrabberConfig/autoactu.com.txt new file mode 100644 index 00000000..9b09d573 --- /dev/null +++ b/data/GrabberConfig/autoactu.com.txt @@ -0,0 +1,5 @@ + +body: //div[@id="bloc_actu"]/parent::* +title: //div[@id="content"]/h1[1] + +test_url: http://www.autoactu.com/thomas-owsianski-nomme-president-d-audi-chine.shtml diff --git a/data/GrabberConfig/autoblog.com.txt b/data/GrabberConfig/autoblog.com.txt new file mode 100644 index 00000000..291db992 --- /dev/null +++ b/data/GrabberConfig/autoblog.com.txt @@ -0,0 +1,6 @@ +prune: no +body: //div[@class='post-body'] +author: //p[@class='byline']//a +date: substring-after(//div[@class='about']/p[2], 'Posted') +strip: //div[@class='body']/div[@class='meta'] +test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/
\ No newline at end of file diff --git a/data/GrabberConfig/autocar.co.uk.txt b/data/GrabberConfig/autocar.co.uk.txt new file mode 100644 index 00000000..9f4fe18b --- /dev/null +++ b/data/GrabberConfig/autocar.co.uk.txt @@ -0,0 +1,13 @@ +title: //div[@class='col-center']/h1 +author: //div[@class='personality']/a +date: //div[@class='personality-date'] +body: //div[@class='content-top ']//div[@class='content'][1] | //div[contains(@class,'article-body')] | //div[contains(@class,'main-article')] + +next_page_link: //div[@id='review-link']/a + +strip: //div[@class='author-block'] +strip: //p//iframe[contains(@src,'signup')]/preceding::p[1] + +test_url: http://www.autocar.co.uk/car-review/volkswagen/golf +test_url: http://www.autocar.co.uk/car-news/pebble-beach/saleen-unveils-performance-electric-vehicle-based-tesla-model-s +test_url: http://www.autocar.co.uk/car-review/rolls-royce/first-drives/rolls-royce-ghost-series-ii-first-drive-review diff --git a/data/GrabberConfig/avclub.com.txt b/data/GrabberConfig/avclub.com.txt new file mode 100644 index 00000000..c365a7aa --- /dev/null +++ b/data/GrabberConfig/avclub.com.txt @@ -0,0 +1,4 @@ +author: //*[@id="article_wrapper"]/div[1]/a[1] +body: //*[@id="article_wrapper"]/div[2] +date: //*[@id="article_wrapper"]/div[1]/text()[2] +test_url: http://www.avclub.com/articles/forgetmenot,70904
\ No newline at end of file diff --git a/data/GrabberConfig/baltimoresun.com.txt b/data/GrabberConfig/baltimoresun.com.txt new file mode 100644 index 00000000..35b62427 --- /dev/null +++ b/data/GrabberConfig/baltimoresun.com.txt @@ -0,0 +1,12 @@ +single_page_link: //div[@class='toppaginate']//a[@rel='nofollow'] +convert_double_br_tags: yes + +title: //div[@class="story"]/h1 +body: //div[@id="story-body-text"] +author: //span[@class="byline"] +date: //p[@class="date"] + +strip: //*[@class='all'] +strip: //*[@class='articlerail'] + +test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story
\ No newline at end of file diff --git a/data/GrabberConfig/baseballprospectus.com.txt b/data/GrabberConfig/baseballprospectus.com.txt new file mode 100644 index 00000000..1207b343 --- /dev/null +++ b/data/GrabberConfig/baseballprospectus.com.txt @@ -0,0 +1,13 @@ +title: //h1[@class='title'] +author: //p[@class="author"]/a[1] +body: //div[@class="article"] +date: //p[@class="date"] + +# remove user tools +strip: //div[@class='tools'] +strip: //h1 +strip: //h2[@class='subtitle'] +strip: //p[@class='author'] +strip: //p[@class='date'] + +test_url: http://www.baseballprospectus.com/article.php?articleid=18463
\ No newline at end of file diff --git a/data/GrabberConfig/basicthinking.de.txt b/data/GrabberConfig/basicthinking.de.txt new file mode 100644 index 00000000..f08c1f26 --- /dev/null +++ b/data/GrabberConfig/basicthinking.de.txt @@ -0,0 +1,7 @@ +title: //h2 +date: //span[@class='date'] +body: //div[@class='entry'] + +strip: //div[@class='zusatz'] + +test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/
\ No newline at end of file diff --git a/data/GrabberConfig/basketeurope.com.txt b/data/GrabberConfig/basketeurope.com.txt new file mode 100644 index 00000000..085c87f0 --- /dev/null +++ b/data/GrabberConfig/basketeurope.com.txt @@ -0,0 +1,9 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' bigslam-single-article-content ')] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' bigslam-blog-info-auteurBE ')] + +strip_id_or_class: wp_rp_wrap +strip_id_or_class: arm_setup_form_container + +test_url: https://www.basketeurope.com/livenews-fr/lnb/436191/paris-basketball-la-premiere-pierre-est-posee-larena-de-la-porte-de-la-chapelle-cest-le-futur/ diff --git a/data/GrabberConfig/bastamag.net.txt b/data/GrabberConfig/bastamag.net.txt new file mode 100644 index 00000000..da5fba7e --- /dev/null +++ b/data/GrabberConfig/bastamag.net.txt @@ -0,0 +1,14 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.bastamag.net%2FEncadrement-des-loyers-pourquoi-Anne-Hidalgo-ferait-bien-de-visiter-Vienne-l + +author: //a[contains(@class, "url")] +date: //time[@pubdate]/@datetime +body: //div[@id='content']//article + +strip_id_or_class: appel-soutien +strip_id_or_class: liste dossiers-content +strip: //header[@class="cartouche"] +strip: //div[contains(@class, "appel_don")] + +test_url: https://www.bastamag.net/Pour-chaque-personne-a-la-rue-la-France-compte-trois-logements-vacants diff --git a/data/GrabberConfig/baylon-industries.com.txt b/data/GrabberConfig/baylon-industries.com.txt new file mode 100644 index 00000000..381e0a7d --- /dev/null +++ b/data/GrabberConfig/baylon-industries.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: 
http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.baylon-industries.com%2Fnews%2F%3Fp%3D1440 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry_content ')] +test_url: http://www.baylon-industries.com/news/?p=1440 diff --git a/data/GrabberConfig/bbc.co.uk.txt b/data/GrabberConfig/bbc.co.uk.txt new file mode 100644 index 00000000..5fd83c11 --- /dev/null +++ b/data/GrabberConfig/bbc.co.uk.txt @@ -0,0 +1,73 @@ +body: //div[@id="story-body"] +# for video entries +body: //div[contains(@class, "videoInStory") or @id="meta-information"] +title: //h1[@class="story-header"] +date: //span[@class="story-date"]/span[@class='date'] +# for sport site +date: //meta[@name='DCTERMS.created']/@content +author: //div[@id='headline']//span[@class='byline-name'] + +# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055 +body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] + +#strip: //div[@class="story-feature narrow"] +#strip: //div[@class="story-feature wide"] +#strip: //div[@class="story-feature dslideshow-enclosure"] +strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))] +strip: //span[@class="story-date"] +#strip: //div[@class="caption body-narrow-width"] +strip: //div[@class="warning"]//p +strip: //div[@id='page-bookmark-links-head'] +strip: //object +strip: //div[contains(@class, "bbccom_advert_placeholder")] +strip: //div[contains(@class, "embedded-hyper")] +strip: //div[contains(@class, 'market-data')] +strip: //a[contains(@class, 'hidden')] +strip: //div[contains(@class, 'hypertabs')] +strip: //div[contains(@class, 'related')] +strip: //form[@id='comment-form'] +strip: //div[contains(@class, 'comment-introduction')] +strip: //div[contains(@class, 'share-tools')] +strip: //div[@id='also-related-links'] + +strip: //figcaption +strip_id_or_class: image-and-copyright-container + +strip: //aside[contains(@class, 'sp-pullout')] + +strip_id_or_class: share-help +strip_id_or_class: 
comments_module + +find_string: http://ichef.bbci.co.uk/news/200/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +find_string: http://ichef.bbci.co.uk/news/304/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +find_string: http://ichef.bbci.co.uk/news/320/ +replace_string: http://ichef.bbci.co.uk/news/660/ + +replace_string({width}{hidpi}): 624 + +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> + +native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')] + +tidy: no +prune: no + +dissolve: //h2 + +test_url: http://www.bbc.co.uk/sport/0/football/23224017 +test_contains: Swansea City have completed the club-record signing + +test_url: http://www.bbc.co.uk/news/business-15060862 +test_contains: Europe's leaders are meeting again to try to solve + +# news feed +test_url: http://feeds.bbci.co.uk/news/rss.xml +# sports feed +test_url: http://feeds.bbci.co.uk/sport/0/football/rss.xml?edition=int +# video entry +test_url: http://www.bbc.co.uk/news/world-asia-22056933 diff --git a/data/GrabberConfig/bbc.com.txt b/data/GrabberConfig/bbc.com.txt new file mode 100644 index 00000000..b8fb3e27 --- /dev/null +++ b/data/GrabberConfig/bbc.com.txt @@ -0,0 +1,76 @@ +body: //div[@id="story-body"] +# for video entries +body: //div[contains(@class, "videoInStory") or @id="meta-information"] +title: //h1[@class="story-header"] +date: //span[@class="story-date"]/span[@class='date'] +# for sport site +date: //meta[@name='DCTERMS.created']/@content +author: //div[@id='headline']//span[@class='byline-name'] + +# recipes, e.g. 
http://www.bbc.co.uk/food/recipes/mymincepies_71055 +body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] + +#strip: //div[@class="story-feature narrow"] +#strip: //div[@class="story-feature wide"] +#strip: //div[@class="story-feature dslideshow-enclosure"] +strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))] +strip: //span[@class="story-date"] +#strip: //div[@class="caption body-narrow-width"] +strip: //div[@class="warning"]//p +strip: //div[@id='page-bookmark-links-head'] +strip: //object +strip: //div[contains(@class, "bbccom_advert_placeholder")] +strip: //div[contains(@class, "embedded-hyper")] +strip: //div[contains(@class, 'market-data')] +strip: //a[contains(@class, 'hidden')] +strip: //div[contains(@class, 'hypertabs')] +strip: //div[contains(@class, 'related')] +strip: //form[@id='comment-form'] +strip: //div[contains(@class, 'comment-introduction')] +strip: //div[contains(@class, 'share-tools')] +strip: //div[@id='also-related-links'] + +strip: //figcaption +strip_id_or_class: image-and-copyright-container + +strip: //aside[contains(@class, 'sp-pullout')] + +strip_id_or_class: share-help +strip_id_or_class: comments_module + +find_string: http://ichef.bbci.co.uk/news/200/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +find_string: http://ichef.bbci.co.uk/news/304/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +find_string: http://ichef.bbci.co.uk/news/320/ +replace_string: http://ichef.bbci.co.uk/news/660/ + +replace_string({width}{hidpi}): 624 + +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> + +native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')] + +tidy: no +prune: no + +dissolve: //h2 + +test_url: http://www.bbc.com/sport/0/football/28918021 +test_contains: Cameroonian footballer Albert Ebosse has died + +test_url: http://www.bbc.com/sport/0/football/23224017 + +test_url: http://www.bbc.com/news/business-15060862 +test_contains: Europe's 
leaders are meeting again to try + + +# news feed +test_url: http://feeds.bbci.co.uk/news/rss.xml +# sports feed +test_url: http://feeds.bbci.co.uk/sport/0/football/rss.xml?edition=int +# video entry +test_url: http://www.bbc.com/news/world-asia-22056933 diff --git a/data/GrabberConfig/bbcgoodfood.com.txt b/data/GrabberConfig/bbcgoodfood.com.txt new file mode 100644 index 00000000..1547d625 --- /dev/null +++ b/data/GrabberConfig/bbcgoodfood.com.txt @@ -0,0 +1,16 @@ +title: //header//h1 +#body: //article[contains(@class, 'node-full')] +body: //div[contains(@class, 'recipe-details') or contains(@class, 'tips-carousel')] | //section[@id='recipe-ingredients' or @id='recipe-method'] + +strip_id_or_class: recipe-rating-wrapper +strip_id_or_class: magazine-subcribe-header +strip_id_or_class: hide +strip_id_or_class: recipe-actions +strip_id_or_class: buy-ingredients +strip_id_or_class: related-content +strip_id_or_class: recipe-magazine-ad +strip_id_or_class: copy-right + +prune: no + +test_url: http://www.bbcgoodfood.com/recipes/1131634/minced-beef-wellington
\ No newline at end of file diff --git a/data/GrabberConfig/bearmetal.eu.txt b/data/GrabberConfig/bearmetal.eu.txt new file mode 100644 index 00000000..44d34655 --- /dev/null +++ b/data/GrabberConfig/bearmetal.eu.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fbearmetal.eu%2Ftheden%2Fits-not-about-you%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: https://bearmetal.eu/theden/its-not-about-you/ diff --git a/data/GrabberConfig/becomingminimalist.com.txt b/data/GrabberConfig/becomingminimalist.com.txt new file mode 100644 index 00000000..79335884 --- /dev/null +++ b/data/GrabberConfig/becomingminimalist.com.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.becomingminimalist.com%2Fmost-popular-posts%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://www.becomingminimalist.com/most-popular-posts/ +test_url: http://www.becomingminimalist.com/the-10-most-important-things-to-simplify-in-your-life/ diff --git a/data/GrabberConfig/begeek.fr.txt b/data/GrabberConfig/begeek.fr.txt new file mode 100644 index 00000000..5acec030 --- /dev/null +++ b/data/GrabberConfig/begeek.fr.txt @@ -0,0 +1,17 @@ +title: //div[@class='title']/h1 +author: //div[@class="author_link"]/span/a[@itemprop="author"] +date: //div[@id="publish_post"]/time/@datetime +body: //section[@id="single"] + +strip: //div[@class="title"] +strip: //div[@class="info_top"] +strip: //div[@class="follow-tools"] +strip: //div[@class="author"] +strip: //div[@id="share_post"] +strip: //div[@id="topic_ass"] +strip: //div[@id="taboola-below-article-thumbs-mix"] +strip: //section[@id="facebook-com"] 
+strip: //section[@id="related_post"] +strip: //div[@id="ligatus"] + +test_url: http://www.begeek.fr/videos-amazon-va-defier-youtube-france-202647 diff --git a/data/GrabberConfig/benoitmaison.org.txt b/data/GrabberConfig/benoitmaison.org.txt new file mode 100644 index 00000000..72c1baed --- /dev/null +++ b/data/GrabberConfig/benoitmaison.org.txt @@ -0,0 +1,16 @@ +body: //div[@class="entry-content"] + +# Remove text ‘Tweet’ +strip: //div[@class="entry-content"]/div[last()] + +title: //h1[@class="entry-title"] + +# If the Instapaper text parser worked with HTML5 tags, we would use: +date: //time[@class="entry-date"] + +# But since it does not, use this more complicated rule: +date: //div[@class="entry-meta"]/a[@rel="bookmark"] + +# Unfortunately, the following rule is overridden by the automatically found author. +author: ("Benoit Maison") +test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/
\ No newline at end of file diff --git a/data/GrabberConfig/berlingske.dk.txt b/data/GrabberConfig/berlingske.dk.txt new file mode 100644 index 00000000..9f8c41c6 --- /dev/null +++ b/data/GrabberConfig/berlingske.dk.txt @@ -0,0 +1,3 @@ +title: //h1[@class='headline'] +body: //div[contains(@class, 'article-wrapper')] +test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa
\ No newline at end of file diff --git a/data/GrabberConfig/bernama.com.txt b/data/GrabberConfig/bernama.com.txt new file mode 100644 index 00000000..fdc04b7f --- /dev/null +++ b/data/GrabberConfig/bernama.com.txt @@ -0,0 +1,5 @@ +body: //div[contains(@class, "NewsText")] +prune: no + +test_url: http://www.bernama.com/bernama/v7/rss/english.php +test_url: http://www.bernama.com/bernama/v7/newsindex.php?id=943513
\ No newline at end of file diff --git a/data/GrabberConfig/betabeat.com.txt b/data/GrabberConfig/betabeat.com.txt new file mode 100644 index 00000000..7815cf26 --- /dev/null +++ b/data/GrabberConfig/betabeat.com.txt @@ -0,0 +1,2 @@ +body: //div[@class="entry-content"] +test_url: http://www.betabeat.com/2011/07/04/sheryl-sandberg-breaks-through-silicon-valleys-boys-club-sort-of/
\ No newline at end of file diff --git a/data/GrabberConfig/betanews.com.txt b/data/GrabberConfig/betanews.com.txt new file mode 100644 index 00000000..90a54a23 --- /dev/null +++ b/data/GrabberConfig/betanews.com.txt @@ -0,0 +1,7 @@ +# for some articles at this site, like this one, normal +# processing doesn't seem to pick up the article body; +# other articles come through fine +# http://www.betanews.com/joewilcox/article +# /Google-is-a-marketing-sensation/1309708375 +body: //*[@id="article"] +test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375
\ No newline at end of file diff --git a/data/GrabberConfig/bez.es.txt b/data/GrabberConfig/bez.es.txt new file mode 100644 index 00000000..cab7c6f7 --- /dev/null +++ b/data/GrabberConfig/bez.es.txt @@ -0,0 +1,5 @@ +body: //div[@class='text_art'] +strip: //div[@class='cab_datos_opinion'] +strip: //div[@class='sumario2_left'] + +test_url: http://www.bez.es/382758623/otros-fracasos-empresas.html diff --git a/data/GrabberConfig/biography.com.txt b/data/GrabberConfig/biography.com.txt new file mode 100644 index 00000000..e431037a --- /dev/null +++ b/data/GrabberConfig/biography.com.txt @@ -0,0 +1,8 @@ +title: //div[contains(@class, 'main-content')]//h1 +body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')] + +prune: no + +single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')] + +test_url: http://www.biography.com/print/profile/martin-luther-9389283
\ No newline at end of file diff --git a/data/GrabberConfig/birthdayshoes.com.txt b/data/GrabberConfig/birthdayshoes.com.txt new file mode 100644 index 00000000..b2b894fa --- /dev/null +++ b/data/GrabberConfig/birthdayshoes.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fbirthdayshoes.com%2Fwhy-toe-shoes + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post ')] +test_url: http://birthdayshoes.com/why-toe-shoes
\ No newline at end of file diff --git a/data/GrabberConfig/bit-tech.net.txt b/data/GrabberConfig/bit-tech.net.txt new file mode 100644 index 00000000..c6f5b204 --- /dev/null +++ b/data/GrabberConfig/bit-tech.net.txt @@ -0,0 +1,19 @@ +body: //div[@id='column_1'] +next_page_link: //div[@class='next']/a[not(contains(@href, '/comments') or contains(@href, '/news/'))] +prune: no + +author: substring-after(//p[@class='byline'], 'by ') +date: substring-before(substring-after(//p[@class='byline'], 'on '), ' by') + +strip: //h1 +strip_id_or_class: socialLinks +strip_id_or_class: byline +strip_id_or_class: pageSelector +strip_id_or_class: articleTabs +strip_id_or_class: pageNav +strip_id_or_class: share +strip_id_or_class: commentsContainer +strip_id_or_class: below_article_related + +test_url: http://www.bit-tech.net/hardware/storage/2014/08/13/ocz-arc-100-240gb-review/1 +test_url: http://www.bit-tech.net/news/bits/2014/08/15/google-trojan/1 diff --git a/data/GrabberConfig/bitelia.com.txt b/data/GrabberConfig/bitelia.com.txt new file mode 100644 index 00000000..7bffae93 --- /dev/null +++ b/data/GrabberConfig/bitelia.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://bitelia.com/2011/09/klout-midiendo-influencia
\ No newline at end of file diff --git a/data/GrabberConfig/bizjournals.com.txt b/data/GrabberConfig/bizjournals.com.txt new file mode 100644 index 00000000..cfba766f --- /dev/null +++ b/data/GrabberConfig/bizjournals.com.txt @@ -0,0 +1,13 @@ +date: //meta[@name='publish-date']/@content +body: //div[contains(@class, 'articleContentWrapper')] +prune: no + +strip: //div[contains(@class, 'staff_info')]//dd[contains(., 'Twitter')] + +strip_id_or_class: related_content +strip_id_or_class: enlarge +strip_id_or_class: photoBy +strip_id_or_class: older + +test_url: http://www.bizjournals.com/cincinnati/news/2013/10/03/harris-teeter-shareholders-vote-on.html +test_url: http://feeds.bizjournals.com/industry_20?format=xml
\ No newline at end of file diff --git a/data/GrabberConfig/bjango.com.txt b/data/GrabberConfig/bjango.com.txt new file mode 100644 index 00000000..0fed5526 --- /dev/null +++ b/data/GrabberConfig/bjango.com.txt @@ -0,0 +1,7 @@ +title: //h1[@class='articlehead'] +body: //div[@class='column'] +strip: //h1 +strip: //div[@class='help'] + +#no author or date/time provided in current layout +test_url: http://bjango.com/articles/actions/
\ No newline at end of file diff --git a/data/GrabberConfig/bleacherreport.com.txt b/data/GrabberConfig/bleacherreport.com.txt new file mode 100644 index 00000000..9205e44e --- /dev/null +++ b/data/GrabberConfig/bleacherreport.com.txt @@ -0,0 +1,16 @@ +body: //div[contains(@class, 'article_pages')] + +strip_id_or_class: article_page-header +strip_id_or_class: paginator +strip_id_or_class: article_info + +find_string: src="data:image +replace_string: ignore-src="data:image +find_string: data-defer-src=" +replace_string: src=" + +prune: no + +test_url: http://bleacherreport.com/articles/feed +test_url: http://bleacherreport.com/articles/2137787-christian-ponders-newborn-daughter-was-named-after-fsu-legend-bobby-bowden +test_url: http://bleacherreport.com/articles/2137596-college-football-week-1-picks-unlv-runnin-rebels-vs-arizona-wildcats/
\ No newline at end of file diff --git a/data/GrabberConfig/blog.asmartbear.com.txt b/data/GrabberConfig/blog.asmartbear.com.txt new file mode 100644 index 00000000..78d7f516 --- /dev/null +++ b/data/GrabberConfig/blog.asmartbear.com.txt @@ -0,0 +1,7 @@ +title: //title +author: //span[@class='author vcard']/a +date: //p[@class='headline_meta']/abbr[@class='published'] +body: //div[@class='format_text entry-content'] + +strip: //div[@id='dd_ajax_float'] +test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html
\ No newline at end of file diff --git a/data/GrabberConfig/blog.cloudflare.com.txt b/data/GrabberConfig/blog.cloudflare.com.txt new file mode 100644 index 00000000..2f9a5a2f --- /dev/null +++ b/data/GrabberConfig/blog.cloudflare.com.txt @@ -0,0 +1,10 @@ +# Instapaper gets this back to front and only gets the blog title instead of the article title. +title: //div[@class='title'] + +author: //a[ contains(@href, '/people') ] + +body: //div[ @class='post-content' ] + +# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous. +test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n +test_url: https://blog.cloudflare.com/sha-1-deprecation-no-browser-left-behind diff --git a/data/GrabberConfig/blog.eleven-labs.com.txt b/data/GrabberConfig/blog.eleven-labs.com.txt new file mode 100644 index 00000000..a81228e5 --- /dev/null +++ b/data/GrabberConfig/blog.eleven-labs.com.txt @@ -0,0 +1,4 @@ +body://div[@class='post_inner_wrapper'] +date://div[@class='sub_page_caption'] +strip://div[contains(concat(' ',normalize-space(@class),' '),' post_img ')] +test_url: http://blog.eleven-labs.com/en/cqrs-pattern-2/ diff --git a/data/GrabberConfig/blog.eng.xogrp.com.txt b/data/GrabberConfig/blog.eng.xogrp.com.txt new file mode 100644 index 00000000..1daeb76c --- /dev/null +++ b/data/GrabberConfig/blog.eng.xogrp.com.txt @@ -0,0 +1,2 @@ +title: //article[contains(@class, 'type_text')]//h2 +test_url: http://blog.eng.xogrp.com/post/154005485319/node-js-promise-enterprise-grade-first-of-all diff --git a/data/GrabberConfig/blog.fefe.de.txt b/data/GrabberConfig/blog.fefe.de.txt new file mode 100644 index 00000000..97e48e69 --- /dev/null +++ b/data/GrabberConfig/blog.fefe.de.txt @@ -0,0 +1,5 @@ +title: //h2 +date: //h3 +body: //ul + +test_url: http://blog.fefe.de/?ts=b063bf55
\ No newline at end of file diff --git a/data/GrabberConfig/blog.instagram.com.txt b/data/GrabberConfig/blog.instagram.com.txt new file mode 100644 index 00000000..13d1d44a --- /dev/null +++ b/data/GrabberConfig/blog.instagram.com.txt @@ -0,0 +1,11 @@ +# clean Instagram blog a little bit + +tidy:no +prune:no + +body://div[contains(@id,'content')] + +strip_id_or_class:meta +strip_id_or_class:notes +strip_id_or_class:pagination +test_url: http://blog.instagram.com/post/8757832007/fromwhereistand
\ No newline at end of file diff --git a/data/GrabberConfig/blog.instapaper.com.txt b/data/GrabberConfig/blog.instapaper.com.txt new file mode 100644 index 00000000..fda01b15 --- /dev/null +++ b/data/GrabberConfig/blog.instapaper.com.txt @@ -0,0 +1,9 @@ +author: //a[@href="http://www.marco.org/about"] +date: //span[@class="date"] + +# Remove the date from article body. +strip: //span[@class="date"] + +# Remove pagination links from article body. +strip: //div[@id="pagination"] +test_url: http://blog.instapaper.com/post/31303984531
\ No newline at end of file diff --git a/data/GrabberConfig/blog.kaelig.fr.txt b/data/GrabberConfig/blog.kaelig.fr.txt new file mode 100644 index 00000000..f65f096a --- /dev/null +++ b/data/GrabberConfig/blog.kaelig.fr.txt @@ -0,0 +1,5 @@ +body: //*[contains(@class, 'post_content')] +author: string('Kaelig Deloumeau-Prigent') +title: //h1[@class='title'] +date: //span[@class='date'] +test_url: http://blog.kaelig.fr/post/24877648508/pr%C3%A9processeurs-css-renoncer-par-choix-ou-par diff --git a/data/GrabberConfig/blog.mozilla.org.txt b/data/GrabberConfig/blog.mozilla.org.txt new file mode 100644 index 00000000..f002ba4b --- /dev/null +++ b/data/GrabberConfig/blog.mozilla.org.txt @@ -0,0 +1,6 @@ +title: //h1 +author: //address +date: //div[@class="entry-info"]//time/@datetime +body: //div[@class="entry-content"] + +test_url: https://blog.mozilla.org/blog/2018/06/07/parlez-vous-deutsch-rhagor-o-leisiau-i-common-voice/ diff --git a/data/GrabberConfig/blog.naver.com.txt b/data/GrabberConfig/blog.naver.com.txt new file mode 100644 index 00000000..73c30c47 --- /dev/null +++ b/data/GrabberConfig/blog.naver.com.txt @@ -0,0 +1,6 @@ +title: //span[@class='pcol1 itemSubjectBoldfont'] +body: //div[@id='postListBody'] +date: //p[@class='date fil5 pcol2'] +single_page_link: /html/frameset/frame[1]/attribute::src +strip: //div[@class='post-btn'] +test_url: http://blog.naver.com/how2invest/110135068757
\ No newline at end of file diff --git a/data/GrabberConfig/blog.nightly.mozilla.org.txt b/data/GrabberConfig/blog.nightly.mozilla.org.txt new file mode 100644 index 00000000..cb5b2eae --- /dev/null +++ b/data/GrabberConfig/blog.nightly.mozilla.org.txt @@ -0,0 +1,4 @@ +title: //h1 +date: //time/@datetime + +test_url: https://blog.nightly.mozilla.org/2018/06/27/protecting-your-privacy-in-firefox-pre-release/ diff --git a/data/GrabberConfig/blog.niqnutn.com.txt b/data/GrabberConfig/blog.niqnutn.com.txt new file mode 100644 index 00000000..3eee9b30 --- /dev/null +++ b/data/GrabberConfig/blog.niqnutn.com.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fblog.niqnutn.com%2Farticle35%2Fsteganographie-l-art-de-dissimuler-un-message + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article-content ')]//section + +strip_id_or_class: sommaire + +test_url: http://blog.niqnutn.com/index.php?article69/commandes-en-vrac diff --git a/data/GrabberConfig/blog.pchome.net.txt b/data/GrabberConfig/blog.pchome.net.txt new file mode 100644 index 00000000..de81beba --- /dev/null +++ b/data/GrabberConfig/blog.pchome.net.txt @@ -0,0 +1,12 @@ +# PCHOME blog, a popular Chinese blog host +# Oct 15, 2011 +# + +title://*[contains(@class,'imp')]/h2 + +date://*[contains(@class,'imp')]/span +body://div[contains(@id,'blog_content')] + + + +test_url: http://blog.pchome.net/article/462502.html
\ No newline at end of file diff --git a/data/GrabberConfig/blog.pinboard.in.txt b/data/GrabberConfig/blog.pinboard.in.txt new file mode 100644 index 00000000..40f0c560 --- /dev/null +++ b/data/GrabberConfig/blog.pinboard.in.txt @@ -0,0 +1,6 @@ +title: //a[@class="blog_title"] +date: //p[@class="when"]/a +body: //div[@class="blog_entry"] +strip_id_or_class:blog_title +strip_id_or_class:when +test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/
\ No newline at end of file diff --git a/data/GrabberConfig/blog.renren.com.txt b/data/GrabberConfig/blog.renren.com.txt new file mode 100644 index 00000000..01938428 --- /dev/null +++ b/data/GrabberConfig/blog.renren.com.txt @@ -0,0 +1,11 @@ +# This filter is tested on: +# http://blog.renren.com/share/224959024/14260739544 +# http://blog.renren.com/share/231323504/14261768898 +# http://blog.renren.com/share/230305019/1502806705 + +title://h1[contains(@class, 'title-article')] +author://span[contains(@class, 'name')] +body://div[contains(@class, 'content-body')] + +convert_double_br_tags:yes +test_url: http://blog.renren.com/share/224959024/14260739544 diff --git a/data/GrabberConfig/blog.sina.com.cn.txt b/data/GrabberConfig/blog.sina.com.cn.txt new file mode 100644 index 00000000..4895272a --- /dev/null +++ b/data/GrabberConfig/blog.sina.com.cn.txt @@ -0,0 +1,26 @@ +# Sina blog, the most popular blog host in China. +# Its source code is horrible. +# +# Issue: +# Only the first image in the article is displayed. +# The remaining images are replaced with a 1x1 transparent gif by the sina blog host. +# + +title://*[contains(@class,'titName SG_txta')] +author://*[contains(@id,'ownernick')] +date://*[contains(@class,'time SG_txtc')] +body://div[contains(@class,'articalContent')] + +# Remove redundant spans whose class starts with "MASS" +# Example <span class="MASSf21674ffeef7"></span> +strip://span[contains(@class,'MASS')] + +# Remove comments +strip://div[contains(@class,'allComm')] + +# Remove hidden text and links +strip://ins + +tidy:no +convert_double_br_tags:yes +test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html
\ No newline at end of file diff --git a/data/GrabberConfig/blog.spu.edu.txt b/data/GrabberConfig/blog.spu.edu.txt new file mode 100644 index 00000000..68bd4e39 --- /dev/null +++ b/data/GrabberConfig/blog.spu.edu.txt @@ -0,0 +1,2 @@ +body://div[@class='post'] +test_url: http://blog.spu.edu/lectio/from-the-frying-pan-into-the-fire/
\ No newline at end of file diff --git a/data/GrabberConfig/blog.squad.fr.txt b/data/GrabberConfig/blog.squad.fr.txt new file mode 100644 index 00000000..269e5b52 --- /dev/null +++ b/data/GrabberConfig/blog.squad.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fblog.squad.fr%2Fnon-classe%2Fgit-explique-avec-mes-mots.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' infinite-single-article-content ')] +test_url: https://blog.squad.fr/non-classe/git-explique-avec-mes-mots.html
\ No newline at end of file diff --git a/data/GrabberConfig/blog.trello.com.txt b/data/GrabberConfig/blog.trello.com.txt new file mode 100644 index 00000000..3c11a817 --- /dev/null +++ b/data/GrabberConfig/blog.trello.com.txt @@ -0,0 +1,7 @@ +title: //div[contains(@class, 'post-header')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-body ')] +author: //a[contains(@class, 'author-link')] +date: //span[contains(@class, 'byline-date')] + +test_url: http://blog.trello.com/how-to-set-better-life-goals +test_url: http://blog.trello.com/6-mistakes-when-you-work-in-office-but-have-remote-team-members diff --git a/data/GrabberConfig/blog.trendmicro.com.txt b/data/GrabberConfig/blog.trendmicro.com.txt new file mode 100644 index 00000000..0d87db7b --- /dev/null +++ b/data/GrabberConfig/blog.trendmicro.com.txt @@ -0,0 +1,13 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fblog.trendmicro.com%2Ftrendlabs-security-intelligence%2Fwinnti-abuses-github%2F + +title: //div[@id='post-title']//h1 + +date: //li[@class='post-date']//div[@class='meta-info']//a + +author: //a[@rel='author'] + +body: //div[@id='pageContent'] + +test_url: http://blog.trendmicro.com/trendlabs-security-intelligence/winnti-abuses-github/
\ No newline at end of file diff --git a/data/GrabberConfig/blog.twitter.com.txt b/data/GrabberConfig/blog.twitter.com.txt new file mode 100644 index 00000000..119c5585 --- /dev/null +++ b/data/GrabberConfig/blog.twitter.com.txt @@ -0,0 +1,5 @@ +body: //div[@id="component-wrapper"] + +strip_id_or_class: tweet-error-text + +test_url: https://blog.twitter.com/developer/en_us/topics/tools/2018/new-developer-requirements-to-protect-our-platform.html diff --git a/data/GrabberConfig/blog.wells.ee.txt b/data/GrabberConfig/blog.wells.ee.txt new file mode 100644 index 00000000..eae6982b --- /dev/null +++ b/data/GrabberConfig/blog.wells.ee.txt @@ -0,0 +1,6 @@ +title: //h2/a[@class="no-link title"] +author: //h2[@id="blog_owner"] +date: //time +strip: //h2/a[@class="no-link title"] +test_url: http://blog.wells.ee/retina +test_url: http://blog.wells.ee/skeuomorphism
\ No newline at end of file diff --git a/data/GrabberConfig/blog.xebia.fr.txt b/data/GrabberConfig/blog.xebia.fr.txt new file mode 100644 index 00000000..de41260c --- /dev/null +++ b/data/GrabberConfig/blog.xebia.fr.txt @@ -0,0 +1,14 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fblog.xebia.fr%2F2015%2F11%2F18%2Fretour-sur-dockercon-eu-2015-12%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')] + +strip_id_or_class: yarpp-related +strip_id_or_class: dd_button_v +strip_id_or_class: c1 +strip_id_or_class: dd_end +strip_id_or_class: dd_start +strip_id_or_class: dd_outer + +test_url: http://blog.xebia.fr/2015/11/18/retour-sur-dockercon-eu-2015-12/ diff --git a/data/GrabberConfig/blog.youb.fr.txt b/data/GrabberConfig/blog.youb.fr.txt new file mode 100644 index 00000000..8cba3ab2 --- /dev/null +++ b/data/GrabberConfig/blog.youb.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fblog.youb.fr%2Fcreations%2Fbackup-server + +body: //div[contains(concat(' ',normalize-space(@class),' '),' column ')] +test_url: https://blog.youb.fr/creations/backup-server
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.faz.net.txt b/data/GrabberConfig/blogs.faz.net.txt new file mode 100644 index 00000000..4f2626f1 --- /dev/null +++ b/data/GrabberConfig/blogs.faz.net.txt @@ -0,0 +1,45 @@ +# Author: zinnober + +tidy: no +prune: no + +# Set author +author: //a[@rel='author'] + +# Set date +date: //span[@class='Datum'] + +# Content is here +body: //div[@class='Artikel'] + +# Tidy up before article +strip: //div[@id='FAZHeaderNeu'] +strip: //h2[@itemprop='headline'] +strip: //span[@class='Datum'] +strip: //span[@class='Autor'] +strip_id_or_class: ArticlePagerTop +strip: //div[@class='FAZArtikelEinleitung']/h2 + +# General cleanup +strip: //div[@class='clear'] +strip: //span[@class='Bildnachweis'] +strip: //iframe +strip_id_or_class: Community +strip: ' · ' + +# Remove tracking and ads +strip_image_src: /l.gif? +strip: //img[@width='1'] +strip_id_or_class: invisible +strip_id_or_class: Anzeige +strip_id_or_class: billboard + +# Remove clutter after article +strip_id_or_class: Tagline +strip_id_or_class: ArtikelAbbinder +strip_id_or_class: FAZArtikelKommentare +strip_id_or_class: ArtikelKommentieren +strip_id_or_class: FAZContentRight + +# Try it yourself +test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/ diff --git a/data/GrabberConfig/blogs.forbes.com.txt b/data/GrabberConfig/blogs.forbes.com.txt new file mode 100644 index 00000000..86580d21 --- /dev/null +++ b/data/GrabberConfig/blogs.forbes.com.txt @@ -0,0 +1,2 @@ +body: //div[@class='entry'] +test_url: http://blogs.forbes.com/adamhartung/2011/04/08/apple-is-better-managed-than-microsoft/
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.gnome.org.txt b/data/GrabberConfig/blogs.gnome.org.txt new file mode 100644 index 00000000..d018c410 --- /dev/null +++ b/data/GrabberConfig/blogs.gnome.org.txt @@ -0,0 +1,3 @@ +http_header(user-agent): PHP/7.2 + +test_url: https://blogs.gnome.org/aday/2017/08/08/the-gnome-way/ diff --git a/data/GrabberConfig/blogs.hbr.org.txt b/data/GrabberConfig/blogs.hbr.org.txt new file mode 100644 index 00000000..d47c3520 --- /dev/null +++ b/data/GrabberConfig/blogs.hbr.org.txt @@ -0,0 +1,4 @@ +title: //div[@id='pageFeature']/h1 +body: //div[@id='articleBody'] +strip: //div[@class='module wide'] +test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.msdn.com.txt b/data/GrabberConfig/blogs.msdn.com.txt new file mode 100644 index 00000000..11b8d42d --- /dev/null +++ b/data/GrabberConfig/blogs.msdn.com.txt @@ -0,0 +1,6 @@ +title: //h3[@class="post-name"] +author: //span[@class="user-name"] +date: //div[@class="post-date"]/span[@class="value"] +body: //div[@class="post-content user-defined-markup"] +footnotes: no +test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx diff --git a/data/GrabberConfig/blogs.reuters.com.txt b/data/GrabberConfig/blogs.reuters.com.txt new file mode 100644 index 00000000..d3eb9966 --- /dev/null +++ b/data/GrabberConfig/blogs.reuters.com.txt @@ -0,0 +1,3 @@ +title: //div[@id='single']/h1 +body: //div[@id='postcontent'] +test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.sciencemag.org.txt b/data/GrabberConfig/blogs.sciencemag.org.txt new file mode 100644 index 00000000..955eafde --- /dev/null +++ b/data/GrabberConfig/blogs.sciencemag.org.txt @@ -0,0 +1,12 @@ +title: //h1 + +author: //a[contains(@rel, 'author')] + +# Publication date +date: //time + +body: //div[@class='article__body'] + +prune: no + +test_url: http://blogs.sciencemag.org/pipeline/archives/2017/08/23/ultrasound-for-brain-drug-delivery-not-so-fast diff --git a/data/GrabberConfig/blogs.scientificamerican.com.txt b/data/GrabberConfig/blogs.scientificamerican.com.txt new file mode 100644 index 00000000..2102015d --- /dev/null +++ b/data/GrabberConfig/blogs.scientificamerican.com.txt @@ -0,0 +1,16 @@ +# meta data +title://h1[@class = 'postTitle'] +author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|') +date://span[@class = 'datestamp'] + +#body content +body://div[@id = 'singleBlogPost'] + +#reclaim author info +move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv'] +strip://p[@class = 'moreLink mobileHide'] + +#cleanup comments, there might be some open <div> sections +strip://div[@id = 'comments2'] +strip://h3[a[@href = '#add-comment']] +test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.smithsonianmag.com.txt b/data/GrabberConfig/blogs.smithsonianmag.com.txt new file mode 100644 index 00000000..1bc65e77 --- /dev/null +++ b/data/GrabberConfig/blogs.smithsonianmag.com.txt @@ -0,0 +1,15 @@ +# metadata +author://div[@class = 'post']/div[@class='meta']/a[1] +date://div[@id = 'rap']/h2[1] +body://div[@class = 'post'] + +# wrapping caption and image +wrap_in(fieldset)://div[contains(@class, 'wp-caption')] + + +# clean up +strip://div[@class = 'post']/h3[@class = 'storytitle'] +strip://div[@class = 'post']/div[@class = 'social'] +strip://img[@style = 'display:none;'] +strip://img[@height='0' and @width='0'] +test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/
\ No newline at end of file diff --git a/data/GrabberConfig/blogs.technet.com.txt b/data/GrabberConfig/blogs.technet.com.txt new file mode 100644 index 00000000..3d0fbadc --- /dev/null +++ b/data/GrabberConfig/blogs.technet.com.txt @@ -0,0 +1,9 @@ +title: //h3[@class="post-name"] +author: //span[@class="user-name"] +date: //div[@class="post-date"] +body: //div[@class="post-content user-defined-markup"] +strip_id_or_class: log-feedback-list +tidy: no +footnotes: no +test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx +test_url: http://blogs.technet.com/b/isablog/archive/2009/01/07/a-pptp-client-might-fail-to-connect-to-a-vpn-server-on-the-internet-through-an-isa-server-2006.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/bloomberg.com.txt b/data/GrabberConfig/bloomberg.com.txt new file mode 100644 index 00000000..ecbf2211 --- /dev/null +++ b/data/GrabberConfig/bloomberg.com.txt @@ -0,0 +1,11 @@ +date: //*[@itemprop='datePublished']/@datetime +author: //meta[@name="sailthru.author"]/@content +body: //div[contains(@class, 'body-copy')] + +strip_id_or_class: inline-newsletter +strip_id_or_class: terminal-tout-container +strip_id_or_class: lede + +test_url: http://www.bloomberg.com/news/articles/2015-12-30/will-your-job-disappear-by-2024- +test_contains: Every week, hosts Tori Stilwell +test_url: https://www.bloomberg.com/news/articles/2016-12-06/apple-to-start-publishing-ai-research-to-hasten-deep-learning diff --git a/data/GrabberConfig/boagworld.com.txt b/data/GrabberConfig/boagworld.com.txt new file mode 100644 index 00000000..3b3da991 --- /dev/null +++ b/data/GrabberConfig/boagworld.com.txt @@ -0,0 +1,8 @@ +title: //h1[@class="entry-title"][2] +author: string("Paul Boag") +date: substring(//span[@class="meta"], 11) +body: //article +strip: //h2 +strip: //h1 +strip: //div[@id="callsToAction"] +test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/
\ No newline at end of file diff --git a/data/GrabberConfig/bobbyromeo.com.txt b/data/GrabberConfig/bobbyromeo.com.txt new file mode 100644 index 00000000..dcdbd9f3 --- /dev/null +++ b/data/GrabberConfig/bobbyromeo.com.txt @@ -0,0 +1,4 @@ +strip_id_or_class: adsbygoogle +strip_id_or_class: yarpp-related + +test_url: http://bobbyromeo.com/technology/xiaomi-smart-1080p-wifi-ip-camera-rtsp-streaming-hack/ diff --git a/data/GrabberConfig/bohaishibei.com.txt b/data/GrabberConfig/bohaishibei.com.txt new file mode 100644 index 00000000..7d18d431 --- /dev/null +++ b/data/GrabberConfig/bohaishibei.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fbohaishibei.com%2Fpost%2F26901%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article-content ')] +test_url: https://bohaishibei.com/post/26901/
\ No newline at end of file diff --git a/data/GrabberConfig/boingboing.net.txt b/data/GrabberConfig/boingboing.net.txt new file mode 100644 index 00000000..0e8b1519 --- /dev/null +++ b/data/GrabberConfig/boingboing.net.txt @@ -0,0 +1,13 @@ +# This is far from perfect, but so is BoingBoing's markup +title: //h2[@class="headline"] +single_page_link: //h2[@class="headline"]/a +#date: //p[@class="byline"] +body: //div[@id="story"] + +strip_id_or_class: shareMe +strip_id_or_class: authorbox +strip_id_or_class: byline +strip_id_or_class: share +strip_id_or_class: separator + +test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html diff --git a/data/GrabberConfig/book.douban.com.txt b/data/GrabberConfig/book.douban.com.txt new file mode 100644 index 00000000..fe2d2cbf --- /dev/null +++ b/data/GrabberConfig/book.douban.com.txt @@ -0,0 +1,6 @@ +body: //span[@property='v:description'] +date: //span[@property='v:dtreviewed'] +author: //span[@property='v:reviewer'] +prune: no + +test_url: http://book.douban.com/review/2422662/
\ No newline at end of file diff --git a/data/GrabberConfig/bookforum.com.txt b/data/GrabberConfig/bookforum.com.txt new file mode 100644 index 00000000..03b60039 --- /dev/null +++ b/data/GrabberConfig/bookforum.com.txt @@ -0,0 +1,19 @@ +#metadata +title://div[@class = 'Topper']/h1 +author://div[@class = 'Topper']/h3 +date://div[@class = 'Topper']/h6 +body://div[@class = 'Core'] + + + +# clean up +strip://div[@class = 'Topper']/h1 +strip://div[@class = 'Topper']/h3 +strip://div[@class = 'Topper']/h4 +strip://div[@class = 'Topper']/h5 +strip://div[@class = 'Topper']/h6 +strip://br[@clear = 'all'] +strip://div[@class = 'adCore'] +strip://div[@class = 'BookR'] +strip://div[@class = 'InfoBox'] +test_url: http://bookforum.com/inprint/018_04/8595
\ No newline at end of file diff --git a/data/GrabberConfig/borderhouseblog.com.txt b/data/GrabberConfig/borderhouseblog.com.txt new file mode 100644 index 00000000..b4e116fe --- /dev/null +++ b/data/GrabberConfig/borderhouseblog.com.txt @@ -0,0 +1,7 @@ +title://h1 +author://div[@class="meta"]/span/a +date://div[@class="date"] +body://div[@class="content article"] +strip://div[@class="content article"]/h1 + +test_url: http://borderhouseblog.com/?p=7832
\ No newline at end of file diff --git a/data/GrabberConfig/bostonglobe.com.txt b/data/GrabberConfig/bostonglobe.com.txt new file mode 100644 index 00000000..e6871a01 --- /dev/null +++ b/data/GrabberConfig/bostonglobe.com.txt @@ -0,0 +1,19 @@ +# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. + +title: //div[@class="header"]/h1 +author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") +date: //div[@class="byline"]/p[last()] +body: //div[@class="article-body"] + +strip_id_or_class: aside +strip_id_or_class: promo +strip_id_or_class: skip-nav +strip_id_or_class: article-more +strip_id_or_class: article-bar +strip_id_or_class: post-comment + +# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. +strip_id_or_class: figure + +test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html +test_url: http://www.bostonglobe.com/news/bigpicture/2016/09/13/paralympics/qcL3h79ohDG6O69GRWLRuK/story.html diff --git a/data/GrabberConfig/boundlessline.org.txt b/data/GrabberConfig/boundlessline.org.txt new file mode 100644 index 00000000..a836e1e2 --- /dev/null +++ b/data/GrabberConfig/boundlessline.org.txt @@ -0,0 +1,5 @@ +title: substring-before(//title, '|') +body: //div[@class="entry"] +# Remove the author's picture +strip: //div[@class="entry"]/a[1] +test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html
\ No newline at end of file diff --git a/data/GrabberConfig/brainfacts.org.txt b/data/GrabberConfig/brainfacts.org.txt new file mode 100644 index 00000000..9705f621 --- /dev/null +++ b/data/GrabberConfig/brainfacts.org.txt @@ -0,0 +1,10 @@ +title: //div[@class="standard"]/h1 +author: string("BrainFacts.org") +date: //div[@class="meta"]/strong + +strip: //p[@class="skip"] +strip: //div[@class="meta"] +strip: //div[@class="standard"]/h1 +strip: //div[@class="modal"] +strip: //div[@class="columnRight"] +test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/
\ No newline at end of file diff --git a/data/GrabberConfig/brainpickings.org.txt b/data/GrabberConfig/brainpickings.org.txt new file mode 100644 index 00000000..75bc0555 --- /dev/null +++ b/data/GrabberConfig/brainpickings.org.txt @@ -0,0 +1,7 @@ +title: //h1[@class='entry-title'] +body: //div[@class='entry_content'] +strip: //div[@id='bottom_donation'] +strip: //div[@id='bottom_newsletter'] +strip: //div[@id='end_print'] + +test_url: https://www.brainpickings.org/2016/05/26/river-of-shadows-rebecca-solnit-muybridge/
\ No newline at end of file diff --git a/data/GrabberConfig/brandeins.de.txt b/data/GrabberConfig/brandeins.de.txt new file mode 100644 index 00000000..be326346 --- /dev/null +++ b/data/GrabberConfig/brandeins.de.txt @@ -0,0 +1,9 @@ + +body: //div[@class="articleTeaser"] | //section[@class="contentSection"] + +strip: //section[@class="greenBox italic"] + +author: //div[@class="articleAuthor"] +# no publish date on page (the articles are from a monthly periodical) + +test_url: http://www.brandeins.de/archiv/2015/fuehrung/ministry-group-mach-doch-mal-ne-ansage/ diff --git a/data/GrabberConfig/brandingstrategyinsider.com.txt b/data/GrabberConfig/brandingstrategyinsider.com.txt new file mode 100644 index 00000000..fc020539 --- /dev/null +++ b/data/GrabberConfig/brandingstrategyinsider.com.txt @@ -0,0 +1,3 @@ +date://h2[@class="date-header"] +body://div[@class="entry-content"] +test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html
\ No newline at end of file diff --git a/data/GrabberConfig/brasil.elpais.com.txt b/data/GrabberConfig/brasil.elpais.com.txt new file mode 100644 index 00000000..6a22dcb7 --- /dev/null +++ b/data/GrabberConfig/brasil.elpais.com.txt @@ -0,0 +1,26 @@ +title: //meta[@name='DC.title']/@content +title: //div[contains(@class, 'cabecera_noticia')]//h1 +date: //meta[@name='DC.date']/@content +date: //meta[@name='date']/@content +body: //div[@class='columna_texto'] +body: //div[@id='cuerpo_noticia'] +body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] + +prune: no + +strip_id_or_class: disposicion_vertical +strip_id_or_class: ampliar_foto +strip_id_or_class: utilidades +strip_id_or_class: info_relacionada +strip_id_or_class: m-kiosko +strip_id_or_class: info_complementa + +strip: //p[@class='nota_pie'] +strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] +strip: //div[@id='coment' or @id='foros_not'] + +test_url: http://brasil.elpais.com/brasil/2014/10/15/politica/1413334841_878730.html +test_contains: O PT quer intensificar a presença do ex-presidente + +test_url: http://brasil.elpais.com/brasil/2014/10/13/internacional/1413225730_450761.html +test_contains: Todos na localidade onde ele nasceu ainda falavam da façanha diff --git a/data/GrabberConfig/brettterpstra.com.txt b/data/GrabberConfig/brettterpstra.com.txt new file mode 100644 index 00000000..55da1787 --- /dev/null +++ b/data/GrabberConfig/brettterpstra.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='post full'] +title: //h1 +author: substring-after(//title, '- ') +date: //span[@class='date'] +test_url: http://brettterpstra.com/byword-for-ios/
\ No newline at end of file diff --git a/data/GrabberConfig/brightside.me.txt b/data/GrabberConfig/brightside.me.txt new file mode 100644 index 00000000..4dd5b89b --- /dev/null +++ b/data/GrabberConfig/brightside.me.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fbrightside.me%2Fwonder-curiosities%2Ffinland-will-become-the-first-country-in-the-world-to-get-rid-of-all-school-subjects-259910%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')] +test_url: https://brightside.me/wonder-curiosities/finland-will-become-the-first-country-in-the-world-to-get-rid-of-all-school-subjects-259910/ diff --git a/data/GrabberConfig/brooksreview.net.txt b/data/GrabberConfig/brooksreview.net.txt new file mode 100644 index 00000000..d33d7d4e --- /dev/null +++ b/data/GrabberConfig/brooksreview.net.txt @@ -0,0 +1,6 @@ +title: //h1 +body: //div[@class='article'] +body: //div[@class='post'] +date: //*[@id='single']/span +prune: no +test_url: http://brooksreview.net/2011/11/readability-agency/
\ No newline at end of file diff --git a/data/GrabberConfig/bt.no.txt b/data/GrabberConfig/bt.no.txt new file mode 100644 index 00000000..200c2e4e --- /dev/null +++ b/data/GrabberConfig/bt.no.txt @@ -0,0 +1,12 @@ +title: //h1[contains(@class,'articleTitle')] +author: //span[@itemprop='name'] +date: //time[@class='published'] +body: //div[contains(@class,'bodyText')] + +strip_id_or_class: 'pull1' +strip_id_or_class: 'relationArticle' +strip: //span[@class='quote'] + +# strip h2 if at end of article (typically a request for comments) +strip: //div[contains(@class,'bodyText')]/node()[last()-1]/self::h2 +test_url: http://www.bt.no/meninger/debatt/Typisk-norsk-a-vare-god-nok-2884108.html
\ No newline at end of file diff --git a/data/GrabberConfig/buquad.com.txt b/data/GrabberConfig/buquad.com.txt new file mode 100644 index 00000000..f0fd08db --- /dev/null +++ b/data/GrabberConfig/buquad.com.txt @@ -0,0 +1,8 @@ +title: //h1 +author: //h2/a +date: substring-after(//h2, '|') +strip_id_or_class: 'attachment' +strip: //h3 + +body: //div[@class='entry'] +test_url: http://buquad.com/2012/04/09/paul-ryan/
\ No newline at end of file diff --git a/data/GrabberConfig/business.time.com.txt b/data/GrabberConfig/business.time.com.txt new file mode 100644 index 00000000..41f8d12f --- /dev/null +++ b/data/GrabberConfig/business.time.com.txt @@ -0,0 +1,17 @@ +# 2011-10-25 - carlo@... - Initial setup. + +single_page_link: //li[@class='print']/a/@href + +title: //h1 +author: //meta[@name="byline"]/@content +date: //meta[@name="date"]/@content + +strip: //span[@class="see"] +strip: //div[@class="byline"] +strip: //div[@id="date2"] +strip: //h1 +strip: //div[@class='post-rail-ad'] +strip: //div[@class='post-rail-content'] +strip: //aside[@class='post-rail'] + +test_url: http://business.time.com/2012/08/22/can-entrepreneurship-bring-change-where-the-arab-spring-has-not/ diff --git a/data/GrabberConfig/business2community.com.txt b/data/GrabberConfig/business2community.com.txt new file mode 100644 index 00000000..0dcc7ff8 --- /dev/null +++ b/data/GrabberConfig/business2community.com.txt @@ -0,0 +1,5 @@ +date: substring-after(//p[@class='byline'],'Published') + +strip: //div[@class='article-meta'] + +test_url: http://www.business2community.com/social-media/funky-ways-to-print-instagram-photos-0485340 diff --git a/data/GrabberConfig/businessinsider.com.txt b/data/GrabberConfig/businessinsider.com.txt new file mode 100644 index 00000000..39eb7426 --- /dev/null +++ b/data/GrabberConfig/businessinsider.com.txt @@ -0,0 +1,16 @@ +title://div[@class="sl-layout-post"]/h1 +body: //div[contains(@class, 'post-content') or contains(@class, 'slide-module') or contains(@class, 'KonaBody')] +strip: //div[contains(@class, "post-sidebar")] +strip: //div[@id='related-links'] +strip: //div[@class='related-links-container'] +strip: //p[@class='source'] +author://div[@class="byline"]/a +date://div[@class="byline"]/span[@class="date"] +prune: no + +single_page_link: //a[contains(text(), 'View as one page')] + +strip://*[contains(@class,'sponsored-text')] +strip: //div[@id='post_footer'] + +test_url: 
http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1 diff --git a/data/GrabberConfig/businessnews.com.tn.txt b/data/GrabberConfig/businessnews.com.tn.txt new file mode 100644 index 00000000..6502b8e1 --- /dev/null +++ b/data/GrabberConfig/businessnews.com.tn.txt @@ -0,0 +1,12 @@ +body: //div[@id='article_detail'] +title: //meta[@property='og:title']/@content +date: //div[@id='date_com_art']//a[@class='date'] +author: //div[@id='article_detail']//font[@class='auteur'] + +strip_id_or_class: porte_titre_theme +strip_id_or_class: cont_param +strip_id_or_class: date_com_art + +prune: no + +test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1
\ No newline at end of file diff --git a/data/GrabberConfig/businessweek.com.txt b/data/GrabberConfig/businessweek.com.txt new file mode 100644 index 00000000..f546b708 --- /dev/null +++ b/data/GrabberConfig/businessweek.com.txt @@ -0,0 +1,17 @@ +# include the lead graphic in the body, if available +body: //div[contains(concat(' ', normalize-space(@id), ' '), ' lead_graphic ')] | //div[contains(concat(' ', normalize-space(@itemprop), ' '), ' articleBody ')] +title: //h1[contains(concat(' ', normalize-space(@itemprop), ' '), ' headline ')] +date: //time[contains(concat(' ', normalize-space(@itemprop), ' '), ' datePublished ')] + +strip_id_or_class: photo_credit +strip_id_or_class: photo_caption +strip_id_or_class: inline_gallery +# pull quote, often inside a blockquote element +strip_id_or_class: pq +strip_id_or_class: credit +strip_id_or_class: figcaption +strip_id_or_class: related_item + +test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html +test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall +test_url: http://www.businessweek.com/articles/2014-07-09/american-apparel-dov-charneys-sleazy-struggle-for-control diff --git a/data/GrabberConfig/buzzfeed.com.txt b/data/GrabberConfig/buzzfeed.com.txt new file mode 100644 index 00000000..5a578953 --- /dev/null +++ b/data/GrabberConfig/buzzfeed.com.txt @@ -0,0 +1,27 @@ +# Creator: Greg Leuch <greg@...> + +# It can be messy. +tidy:no + +# The basic template. 
+title: //h1[@data-print='title'] +author: //a[@data-print='author'] +date: //time[@data-print='date'] +body: //div[@data-print='body'] +body: //section[@data-print='body'] +body: //article[contains(concat(' ',normalize-space(@class),' '),' buzz ')] + +find_string: rel:bf_image_src= +replace_string: src= +find_string: src="data: +replace_string: disabled_src="data: + +native_ad_clue: //meta[@property="article:section" and @content="Advertiser"] + +# For various things... +strip: *[@data-print="ignore"] +test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays +# Native ad +test_url: http://www.buzzfeed.com/bravo/ways-to-up-your-online-dating-game +# article DIV +test_url: https://www.buzzfeed.com/charliewarzel/the-terrifying-future-of-fake-news diff --git a/data/GrabberConfig/bygonebureau.com.txt b/data/GrabberConfig/bygonebureau.com.txt new file mode 100644 index 00000000..63c82130 --- /dev/null +++ b/data/GrabberConfig/bygonebureau.com.txt @@ -0,0 +1,6 @@ +title: //h1 +author: //a[contains(@href, '/author/')] +date: //*[@class='post-date'] +strip: //*[@class='post-date'] +strip: //h1 +test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/
\ No newline at end of file diff --git a/data/GrabberConfig/cable.co.uk.txt b/data/GrabberConfig/cable.co.uk.txt new file mode 100644 index 00000000..435bf3b5 --- /dev/null +++ b/data/GrabberConfig/cable.co.uk.txt @@ -0,0 +1,11 @@ +title: //div[@class='page-content']//h1 +body: //div[@class='page-content'] +strip_id_or_class: editorial-bar-top +strip_id_or_class: social-bottom +strip_id_or_class: comment-form +strip_id_or_class: pc-why + +prune: no +tidy: no + +test_url: http://www.cable.co.uk/news/bt-vision-unveils-interactive-guide-application-800734218/
\ No newline at end of file diff --git a/data/GrabberConfig/cafebabel.com.txt b/data/GrabberConfig/cafebabel.com.txt new file mode 100644 index 00000000..56e8ccf2 --- /dev/null +++ b/data/GrabberConfig/cafebabel.com.txt @@ -0,0 +1,19 @@ + +body: //div[@id='content'] + +date: //div[@id='content']//time/@datetime + +author: //div[@id='content']//a[contains(concat(' ',normalize-space(@class),' '),' author-link ')] + +strip_id_or_class: share +strip_id_or_class: tags-list +strip_id_or_class: author-container +strip_id_or_class: article-participate +strip_id_or_class: translation-details +strip_id_or_class: contributor-container +strip_id_or_class: article-language-selector +strip: //div[@id='content']//h1[1] +strip: (//p//time//ancestor::p[1])[starts-with(normalize-space(.), 'Published on ')] + +test_url: https://cafebabel.com/es/article/ahmad-shamieh-el-refugiado-sirio-que-pone-en-jaque-a-eslovenia-5b20dfc7f723b325ef3dc566/ + diff --git a/data/GrabberConfig/caffereggio.net.txt b/data/GrabberConfig/caffereggio.net.txt new file mode 100644 index 00000000..94eb524a --- /dev/null +++ b/data/GrabberConfig/caffereggio.net.txt @@ -0,0 +1,3 @@ +body: //div[@class="pf-content"] + +test_url: http://www.caffereggio.net/2017/02/09/apoyo-pablo-iglesias-podemos-unidos-podemos-vicenc-navarro-publico/ diff --git a/data/GrabberConfig/canardpc.com.txt b/data/GrabberConfig/canardpc.com.txt new file mode 100644 index 00000000..4b5b89d2 --- /dev/null +++ b/data/GrabberConfig/canardpc.com.txt @@ -0,0 +1,15 @@ +title: //h2[@class="article-title"] +body: //div[@id="zenContent"] | //div[@id="block-articlenote"] +author: //div[@class="article-author"]//a[@class="username"] + +# wallabag-specific login directives (not supported in FTR) +requires_login: yes + +login_uri: https://www.canardpc.com/user/login +login_username_field: name +login_password_field: pass +login_extra_fields: form_id=user_login_form + +not_logged_in_xpath: //div[@class="messages--error"] + +test_url: 
https://www.canardpc.com/357/fouine-peaks-thimbleweed-park diff --git a/data/GrabberConfig/canonrumors.com.txt b/data/GrabberConfig/canonrumors.com.txt new file mode 100644 index 00000000..c22cf4f1 --- /dev/null +++ b/data/GrabberConfig/canonrumors.com.txt @@ -0,0 +1,28 @@ +# Author: zinnober + +tidy: no +prune: no + +# Set title +title: //h2 + +date: //li[@class='time'] + +# Set author +author: //a[contains(@rel, 'author')] + +# Content is here +body: //div[@id='content'] + +# Tidy up before article +strip: //div[@class='meta'] + +# Tidy up after article +strip_id_or_class: nr_related_placeholder +strip_id_or_class: twitter-share-button +strip_id_or_class: afterpost +strip_id_or_class: tags + +# Try it yourself +test_url: http://www.canonrumors.com/2014/09/chuck-westfall-talks-canon-eos-7d-mark-ii/ +test_url: http://www.canonrumors.com/2014/09/canon-cinema-eos-captures-space-in-4k-for-new-imax-3d-film/ diff --git a/data/GrabberConfig/captaineconomics.fr.txt b/data/GrabberConfig/captaineconomics.fr.txt new file mode 100644 index 00000000..036e0281 --- /dev/null +++ b/data/GrabberConfig/captaineconomics.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.captaineconomics.fr%2F-les-immigres-recemment-arrives-en-france-qui-sont-ils-insee + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post_content ')] +test_url: http://www.captaineconomics.fr/-les-immigres-recemment-arrives-en-france-qui-sont-ils-insee diff --git a/data/GrabberConfig/cardboardconnection.com.txt b/data/GrabberConfig/cardboardconnection.com.txt new file mode 100644 index 00000000..49f34302 --- /dev/null +++ b/data/GrabberConfig/cardboardconnection.com.txt @@ -0,0 +1,8 @@ +title: //h1[@class='producttabbed-title'] +body: //div[@class='postTabs_divs postTabs_curr_div'] +strip: //div[@class='ratingblock2'] +strip: 
//p[@id='breadcrumbs'] +strip: //div[@style='display: none'] + + +test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards
\ No newline at end of file diff --git a/data/GrabberConfig/carlchenet.com.txt b/data/GrabberConfig/carlchenet.com.txt new file mode 100644 index 00000000..b87e8399 --- /dev/null +++ b/data/GrabberConfig/carlchenet.com.txt @@ -0,0 +1,3 @@ +date: //time/@datetime + +test_url: https://carlchenet.com/foss-passive-consumerism-kills-our-community/ diff --git a/data/GrabberConfig/cars.com.txt b/data/GrabberConfig/cars.com.txt new file mode 100644 index 00000000..71c5c050 --- /dev/null +++ b/data/GrabberConfig/cars.com.txt @@ -0,0 +1,7 @@ +title: //div[contains(@class, 'basicInfo')]//h1 + +body: //img[@id='chosenPhotoIMG'] | //div[@id='aboutThisVehicleBox'] + +prune: no + +test_url: http://www.cars.com/go/search/detail.jsp?listingId=115364779
\ No newline at end of file diff --git a/data/GrabberConfig/catb.org.txt b/data/GrabberConfig/catb.org.txt new file mode 100644 index 00000000..2cd197fb --- /dev/null +++ b/data/GrabberConfig/catb.org.txt @@ -0,0 +1,7 @@ +body: //div[@class='article'] +strip: //div[@class='revhistory'] +strip: //div[@class='toc'] +tidy: no +prune: no + +test_url: http://catb.org/~esr/faqs/smart-questions.html
\ No newline at end of file diff --git a/data/GrabberConfig/cbsnews.com.txt b/data/GrabberConfig/cbsnews.com.txt new file mode 100644 index 00000000..04d20230 --- /dev/null +++ b/data/GrabberConfig/cbsnews.com.txt @@ -0,0 +1,15 @@ +date: //meta[@name="published"]/@content +date: //div[@class="timeLine"] +title: //div[@id='contentBody']//h1 +author: //dl[@class="storyBlogByline"]/dd/a +body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')] + +# Content Pruning +strip: //div[@class="scrollingArrows"] +strip: //div[@class="timeLine"] +strip: //dl[@class="storyBlogByline"] +strip: //span[@class='image-credit'] + +prune: no + +test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/ diff --git a/data/GrabberConfig/chareidi.org.txt b/data/GrabberConfig/chareidi.org.txt new file mode 100644 index 00000000..de34a7d8 --- /dev/null +++ b/data/GrabberConfig/chareidi.org.txt @@ -0,0 +1,2 @@ +title: //h1 +test_url: http://www.chareidi.org/archives5772/tetzaveh/TZV72adraft.htm
\ No newline at end of file diff --git a/data/GrabberConfig/chefkoch.de.txt b/data/GrabberConfig/chefkoch.de.txt new file mode 100644 index 00000000..ff251606 --- /dev/null +++ b/data/GrabberConfig/chefkoch.de.txt @@ -0,0 +1,57 @@ +prune: no + +######################################## +# Chefkoch Rezepte +######################################## + +# Set article information +title: //h1[@class = 'page-title'] +author: //span[@class= 'author'] + +# Content +body: //div[contains(@class, 'main-content')] + +# Cleanup +strip_id_or_class: text-ads-rezeptbild +strip_id_or_class: recipe-buttons +strip_id_or_class: incredientform +strip_id_or_class: js-how2video-container +strip_id_or_class: order-online +strip_id_or_class: mobile-only +strip_id_or_class: text-ads-unter-zubereitung +strip_id_or_class: sharing-wrapper + +test_url: http://www.chefkoch.de/rezepte/1748851284207014/Schweinefilet-mit-Apfel-Curry-Sauce.html +test_contains: Das Schweinefilet trocken tupfen und einmal quer halbieren + + +############################################ +# Chefkoch Magazin +############################################ + +# Article information +title: //h1[contains(@class, 'headline-duo__headline')] +author: //div[@class='article-information__meta']/span[1] + +# Content +body: //div[@class='article-content'] + +# Cleanup +strip_id_or_class: slider +strip_id_or_class: recipe-image-voting +strip_id_or_class: recipe-comments +strip_id_or_class: kommentarform +strip_id_or_class: responsive-ad +strip_id_or_class: sg-note +strip_id_or_class: simplora-widget +strip_id_or_class: recipe2shoppinglist +strip_id_or_class: gujAd +strip_id_or_class: teaser-top-small +strip_id_or_class: rezeptvideos +strip_id_or_class: recipe-guide-wrapper +strip_id_or_class: recipe-guide-hint-tile +strip_id_or_class: container-video +strip_id_or_class: recipe2shoppinglist + +test_url: http://www.chefkoch.de/magazin/artikel/6157/Chefkoch/kamelle-selber-machen.html +test_contains: Für viele ist der Umzug am Rosenmontag 
die Gelegenheit, den Bonbon-Vorrat für das ganze Jahr zu sichern. diff --git a/data/GrabberConfig/chomsky.info.txt b/data/GrabberConfig/chomsky.info.txt new file mode 100644 index 00000000..45df0ba6 --- /dev/null +++ b/data/GrabberConfig/chomsky.info.txt @@ -0,0 +1,6 @@ +title: //*[@class='title'] +author: //*[@class='author'] +prune: no + +test_url: https://chomsky.info/20150120/ +test_contains: The crimes also elicited a flood of commentary diff --git a/data/GrabberConfig/chrisltd.com.txt b/data/GrabberConfig/chrisltd.com.txt new file mode 100644 index 00000000..86d0f5db --- /dev/null +++ b/data/GrabberConfig/chrisltd.com.txt @@ -0,0 +1,6 @@ +title: //header/h1/b[contains(@class, 'title')] +author: substring-after(//article/header/div, 'By ') +date: //header/h1/span[contains(@class, 'date')] +body: //div[@id='main']/article +strip: //header +test_url: http://chrisltd.com/blog/2012/03/fix-widows-indesign/
\ No newline at end of file diff --git a/data/GrabberConfig/christianitytoday.com.txt b/data/GrabberConfig/christianitytoday.com.txt new file mode 100644 index 00000000..86be14ce --- /dev/null +++ b/data/GrabberConfig/christianitytoday.com.txt @@ -0,0 +1,13 @@ +title://div[@class='title'] +author://div[@class='byline']/b +date:substring-after(//div[@class='byline'], 'posted') +body://div[@id='body'] +wrap_in(h2)://span[@class='subhead'] +wrap_in(i)://p[@class='bio'] +wrap_in(i)://p[@class='copyright'] +strip://div[@class='title'] +strip://div[@class='deck'] +strip://div[@class='byline'] +strip://div[@class='copyright'] +strip://br +test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html
\ No newline at end of file diff --git a/data/GrabberConfig/christies.com.txt b/data/GrabberConfig/christies.com.txt new file mode 100644 index 00000000..b3c76519 --- /dev/null +++ b/data/GrabberConfig/christies.com.txt @@ -0,0 +1,6 @@ +tidy: no +prune: no +date: //article//time[@pubdate] +title: //article/header/h2 +body: //article +test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3
\ No newline at end of file diff --git a/data/GrabberConfig/chrome.google.com.txt b/data/GrabberConfig/chrome.google.com.txt new file mode 100644 index 00000000..5a1d043d --- /dev/null +++ b/data/GrabberConfig/chrome.google.com.txt @@ -0,0 +1,9 @@ +body: //pre[@id='cx-desc-text'] +body: //div[contains(@class, 'overview-tab-right-bar-info')] +title: //h1[contains(@class, 'detail-dialog-title')] +tidy: no +prune: no +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> + +test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc
\ No newline at end of file diff --git a/data/GrabberConfig/chronicle.com.txt b/data/GrabberConfig/chronicle.com.txt new file mode 100644 index 00000000..227a03c7 --- /dev/null +++ b/data/GrabberConfig/chronicle.com.txt @@ -0,0 +1,10 @@ +title: //h1[contains(@class, "entry-title")] +author: //p[contains(@class, "byline")] + +# all (?) other articles +body: //div[@class="content-item__story"] +date: //p[contains(@class, "dateline")] + +# note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper $ +test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/ +test_url: http://chronicle.com/blogs/wiredcampus/university-run-boot-camps-offer-students-marketable-skills-but-not-course-credit/57494 diff --git a/data/GrabberConfig/ciaosamin.com.txt b/data/GrabberConfig/ciaosamin.com.txt new file mode 100644 index 00000000..ee09060c --- /dev/null +++ b/data/GrabberConfig/ciaosamin.com.txt @@ -0,0 +1,4 @@ +body://div[contains(@class, 'entry-content')] +date://h2[contains(@class, 'date-header')] +title://h3[contains(@class, 'post-title')] +test_url: http://ciaosamin.com/ciao/2015/12/28/recipe-million-dollar-caramels diff --git a/data/GrabberConfig/cicero.de.txt b/data/GrabberConfig/cicero.de.txt new file mode 100644 index 00000000..b8913639 --- /dev/null +++ b/data/GrabberConfig/cicero.de.txt @@ -0,0 +1,33 @@ +# fforst@... 
+ +# Use link to print article for single page view +single_page_link: //a[@class="print"] + +# set body +tidy: no +body: //div[@class='artikel-content'] + +# strip title and subtitle since we got it already +strip: //div[@class='issue'] +strip: //div[@class='artikel-content']/h2 + +# some authors are known and have a link, others don't +author: //a[contains(@href, 'autor?')] + +#date +date: //span[@class='article-date'] + +# Strip author since we got him +strip_id_or_class: author + +#strip captions +strip_id_or_class: field-name-field-image-credit +strip_id_or_class: field-name-field-article-image-subtitle + +# remove community functions +strip: //div[@class='meta'] +strip: //div[@id='comments'] + +# remove "continue on the next page" text +strip: //p[text()="[SEITE]"] +test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049
\ No newline at end of file diff --git a/data/GrabberConfig/cio.com.txt b/data/GrabberConfig/cio.com.txt new file mode 100644 index 00000000..3d053761 --- /dev/null +++ b/data/GrabberConfig/cio.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.cio.com/article/3167845/social-business/5-things-you-need-to-know-about-snap-s-ipo.html diff --git a/data/GrabberConfig/ciperchile.cl.txt b/data/GrabberConfig/ciperchile.cl.txt new file mode 100644 index 00000000..d7e9b762 --- /dev/null +++ b/data/GrabberConfig/ciperchile.cl.txt @@ -0,0 +1,4 @@ +body: //*[(@id = "articlebody")] +strip_id_or_class: rotulo + +test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/
\ No newline at end of file diff --git a/data/GrabberConfig/cjr.org.txt b/data/GrabberConfig/cjr.org.txt new file mode 100644 index 00000000..df4c7cc4 --- /dev/null +++ b/data/GrabberConfig/cjr.org.txt @@ -0,0 +1,6 @@ +body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body'] +prune: no + +single_page_link: //li[@class='print']/a + +test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php
\ No newline at end of file diff --git a/data/GrabberConfig/clientk.com.txt b/data/GrabberConfig/clientk.com.txt new file mode 100644 index 00000000..d5a22ccb --- /dev/null +++ b/data/GrabberConfig/clientk.com.txt @@ -0,0 +1,6 @@ +title://div[@class="entrytitle"]/a +author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ") +date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted") +body://div[@class="entrybody"] +strip://div[@class="entrybody"]//p[@class="singleinfo"] +test_url: http://clientk.com/2011/12/19/the-impact-of-more/
\ No newline at end of file diff --git a/data/GrabberConfig/cloudacademy.com.txt b/data/GrabberConfig/cloudacademy.com.txt new file mode 100644 index 00000000..f5b67840 --- /dev/null +++ b/data/GrabberConfig/cloudacademy.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fcloudacademy.com%2Fblog%2Fberkshelf-manage-chef-cookbooks%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://cloudacademy.com/blog/berkshelf-manage-chef-cookbooks/
\ No newline at end of file diff --git a/data/GrabberConfig/clubic.com.txt b/data/GrabberConfig/clubic.com.txt new file mode 100644 index 00000000..5d35ceff --- /dev/null +++ b/data/GrabberConfig/clubic.com.txt @@ -0,0 +1,17 @@ +title: //h1 +author: //a[@class='auteur'] +body: //div[@class='editorial'] +next_page_link: //a[contains(text(),'Page suivante')] +strip: //a[contains(text(),'Page suivante')] +strip: //a[contains(text(),'Page précédente')] + +body: //div[@id='content'] +strip_id_or_class: breadcrumb +strip_id_or_class: kadmer-ad +strip_id_or_class: slideshow-head +strip_id_or_class: parse-shopper-resume-produit +prune: no + +test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html +test_url: http://www.clubic.com/mag/diaporama/photo-space-impressionnantes-photos-atterrissage-barge-84782/ +test_url: http://www.clubic.com/application-mobile/actualite-825698-whatsapp-status-attaque-snapchat.html diff --git a/data/GrabberConfig/cmace.de.txt b/data/GrabberConfig/cmace.de.txt new file mode 100644 index 00000000..701d8f59 --- /dev/null +++ b/data/GrabberConfig/cmace.de.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.cmace.de%2Findex.php%2FThread%2F9780-Urlaub-auf-Madeira-Erlebnisse-und-Empfehlungen%2F + +body: //aside[contains(concat(' ',normalize-space(@class),' '),' member ')] | //div[contains(concat(' ',normalize-space(@class),' '),' messageText ')] +next_page_link: //a[contains(@title, 'Nächste Seite')] +test_url: https://www.cmace.de/index.php/Thread/9780-Urlaub-auf-Madeira-Erlebnisse-und-Empfehlungen/ diff --git a/data/GrabberConfig/cmswire.com.txt b/data/GrabberConfig/cmswire.com.txt new file mode 100644 index 00000000..0b76377a --- /dev/null +++ b/data/GrabberConfig/cmswire.com.txt @@ -0,0 +1,6 @@ +body: 
//div[contains(@id,'article-body')] +strip://div[contains(@id,'disqus_count_block')] +strip://div[contains(@id,'col-left')] +strip://div[contains(@id,'col-right')] + +test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php
\ No newline at end of file diff --git a/data/GrabberConfig/cn.engadget.com.txt b/data/GrabberConfig/cn.engadget.com.txt new file mode 100644 index 00000000..63f6f7ea --- /dev/null +++ b/data/GrabberConfig/cn.engadget.com.txt @@ -0,0 +1,5 @@ +title: //h2[@class="posttitle"] +body: //div[@class="postbody"] +prune: no + +test_url: http://cn.engadget.com/2013/06/29/google-play-music-all-access/ diff --git a/data/GrabberConfig/cn.reuters.com.txt b/data/GrabberConfig/cn.reuters.com.txt new file mode 100644 index 00000000..28f10472 --- /dev/null +++ b/data/GrabberConfig/cn.reuters.com.txt @@ -0,0 +1,9 @@ +title: //div[@id='maincontent']//h1 +body: //div[@id='resizeableText'] + +single_page_link: concat(//link[@rel='canonical']/@href, '?sp=true') + +test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710 +test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews +# multipage link +test_url: http://cn.reuters.com/article/idCNKBS0FF0UL20140710
\ No newline at end of file diff --git a/data/GrabberConfig/cnet.com.txt b/data/GrabberConfig/cnet.com.txt new file mode 100644 index 00000000..22be3110 --- /dev/null +++ b/data/GrabberConfig/cnet.com.txt @@ -0,0 +1,20 @@ +title: //meta[@property="og:title"]/@content +body: //div[contains(@class, 'postBody')] +date: //div[@id='nameAndTime']/time +author: //div[@id='nameAndTime']/span[@class='author'] + +strip_id_or_class: image-credit +strip_id_or_class: noAutolink +strip_id_or_class: related +strip_id_or_class: sponsor +strip_id_or_class: sharebarWrapper +strip_id_or_class: collections-topics-and-tags + +prune: no +tidy: no + +# early end +replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html> + +test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/ +test_url: http://www.cnet.com/news/firefox-maker-mozilla-we-dont-need-googles-money-anymore/ diff --git a/data/GrabberConfig/cnn.com.txt b/data/GrabberConfig/cnn.com.txt new file mode 100644 index 00000000..013860f5 --- /dev/null +++ b/data/GrabberConfig/cnn.com.txt @@ -0,0 +1,13 @@ +body: //section[contains(@class, 'body-text')] + +strip_id_or_class: highlights + +# Avoid redirecting to 'unsupported browser' page +find_string: <meta http-equiv="refresh" +replace_string: <meta norefresh + +test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html +test_contains: this discriminatory and ineffective practice + +test_url: http://rss.cnn.com/rss/edition.rss +test_url: http://rss.cnn.com/rss/edition_technology.rss diff --git a/data/GrabberConfig/code.activestate.com.txt b/data/GrabberConfig/code.activestate.com.txt new file mode 100644 index 00000000..83a21e19 --- /dev/null +++ b/data/GrabberConfig/code.activestate.com.txt @@ -0,0 +1,10 @@ +body: //div[@id='content'] +title: //div[@id='page_header']/h1 + +strip_id_or_class: 'lineno' +strip_id_or_class: 'block-toolbar-button' +strip_id_or_class: 'recipe_score' 
+strip: //div[@id='recipe_tools'] +strip: //div[@id='addcomment'] + +test_url: http://code.activestate.com/recipes/500261-named-tuples/
\ No newline at end of file diff --git a/data/GrabberConfig/code.google.com.txt b/data/GrabberConfig/code.google.com.txt new file mode 100644 index 00000000..6e9c00a7 --- /dev/null +++ b/data/GrabberConfig/code.google.com.txt @@ -0,0 +1,5 @@ +body: //div[@id="gc-pagecontent"] +strip: //a[@class="backtotop"] +prune: no + +test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html
\ No newline at end of file diff --git a/data/GrabberConfig/codebase64.org.txt b/data/GrabberConfig/codebase64.org.txt new file mode 100644 index 00000000..d992d2f1 --- /dev/null +++ b/data/GrabberConfig/codebase64.org.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fcodebase64.org%2Fdoku.php%3Fid%3Dbase%3Ademo_coding_introduction + +body: //div[contains(concat(' ',normalize-space(@class),' '),' page ')] + +strip_id_or_class: dw__toc + +test_url: http://codebase64.org/doku.php?id=base:demo_coding_introduction diff --git a/data/GrabberConfig/codeproject.com.txt b/data/GrabberConfig/codeproject.com.txt new file mode 100644 index 00000000..d1191acc --- /dev/null +++ b/data/GrabberConfig/codeproject.com.txt @@ -0,0 +1,3 @@ +body: //div[@id="contentdiv"] +date: //span[@class="date"] +test_url: http://www.codeproject.com/Articles/499902/Profiling-Entity-Framework-5-in-code
\ No newline at end of file diff --git a/data/GrabberConfig/codinghorror.com.txt b/data/GrabberConfig/codinghorror.com.txt new file mode 100644 index 00000000..adf6e5a0 --- /dev/null +++ b/data/GrabberConfig/codinghorror.com.txt @@ -0,0 +1,15 @@ +body: //div[@class='blogbody'] +strip: //h3[@class='title'] +date: //h2[@class='date'] +#Should Atwood just be a literal? +author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V') + +# tim.kingman@... 2011-07-26 +# Prune:no to retain all-link ULs that are part of the body content like +# http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html +# Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed. + +prune: no +strip: //div[@class='posted']/following-sibling::* +strip: //div[@class='posted'] +test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html
\ No newline at end of file diff --git a/data/GrabberConfig/coffeecircle.com.txt b/data/GrabberConfig/coffeecircle.com.txt new file mode 100644 index 00000000..423b570b --- /dev/null +++ b/data/GrabberConfig/coffeecircle.com.txt @@ -0,0 +1,8 @@ +tidy: no +prune: no +body: //div[@class='post--title'] | //div[@class='blog-content'] +strip_id_or_class: tag +strip_id_or_class: photoset-grid +strip_id_or_class: newsletter + +test_url: https://www.coffeecircle.com/de/b/cold-brew-eiskaffee diff --git a/data/GrabberConfig/collegehumor.com.txt b/data/GrabberConfig/collegehumor.com.txt new file mode 100644 index 00000000..318e6ff4 --- /dev/null +++ b/data/GrabberConfig/collegehumor.com.txt @@ -0,0 +1,14 @@ +title: //h1[@class='title'] +author: //p[@class='byline']/a[1] +date: //*[@class='date'] + +body: //div[@class='article_body'] +strip: //p[@class='ca_intro'] +strip: //div[@id='action_bar'] +strip: //div[@class='below_content'] +strip: //div[@id='announcement'] +strip: //div[@id='leftovers'] +strip: //div[@class='form'] +strip: //div[@id='email_overlay'] +strip: //a[@class='close'] +test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie
\ No newline at end of file diff --git a/data/GrabberConfig/commitstrip.com.txt b/data/GrabberConfig/commitstrip.com.txt new file mode 100644 index 00000000..4dd1956c --- /dev/null +++ b/data/GrabberConfig/commitstrip.com.txt @@ -0,0 +1,5 @@ +body: //img[starts-with(@class, 'aligncenter size-full wp-image-')] + +test_url: http://www.commitstrip.com/en/2015/11/10/coder-epitaphs/ +test_url: http://www.commitstrip.com/en/2015/11/12/pay-to-pitch-to-randoms-the-next-big-idea/ +test_url: http://www.commitstrip.com/en/2015/11/13/the-last-ever-line-of-code/ diff --git a/data/GrabberConfig/communities-dominate.blogs.com.txt b/data/GrabberConfig/communities-dominate.blogs.com.txt new file mode 100644 index 00000000..800a907d --- /dev/null +++ b/data/GrabberConfig/communities-dominate.blogs.com.txt @@ -0,0 +1,2 @@ +body: //div[@class="entry-body"] +test_url: http://communities-dominate.blogs.com/brands/2012/03/brutal-truth-about-lumia-cannot-sustain-even-1-to-1-replacement-of-symbian-windows-phone-strategy-do.html
\ No newline at end of file diff --git a/data/GrabberConfig/computerbase.de.txt b/data/GrabberConfig/computerbase.de.txt new file mode 100644 index 00000000..55ec48f2 --- /dev/null +++ b/data/GrabberConfig/computerbase.de.txt @@ -0,0 +1,16 @@ +title://h1 + +author://span[@class="article-authornames"]/a + +body: //div[@class='article-view__content'] + +# this line breaks the parser +#replace_string("padding-bottom:): " + + +strip://div[@class='adbox-wrapper__label'] +strip://div[@class='adbox-rectangle'] +strip://div[@class='adbox-rectangle'] +strip://div[contains(@class,'article-view__share-links-top')] + +test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/ diff --git a/data/GrabberConfig/computerworld.com.txt b/data/GrabberConfig/computerworld.com.txt new file mode 100644 index 00000000..a93446af --- /dev/null +++ b/data/GrabberConfig/computerworld.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.computerworld.com/article/3054550/data-storage/scientists-could-use-dna-to-shrink-a-data-center-into-a-sugar-cube.html diff --git a/data/GrabberConfig/computerworld.dk.txt b/data/GrabberConfig/computerworld.dk.txt new file mode 100644 index 00000000..d819109c --- /dev/null +++ b/data/GrabberConfig/computerworld.dk.txt @@ -0,0 +1,5 @@ +strip: //div[contains(@class, 'articleAdtechAd')] +title: //div[@id='article']/h1 +title: //div[contains(@class, 'article')]/h1 
+body: //div[@id='articleText'] +test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0
\ No newline at end of file diff --git a/data/GrabberConfig/contexte.com.txt b/data/GrabberConfig/contexte.com.txt new file mode 100644 index 00000000..b3327abe --- /dev/null +++ b/data/GrabberConfig/contexte.com.txt @@ -0,0 +1,29 @@ + +body: //p[contains(concat(' ',normalize-space(@class),' '),' article-header-lead ')] | //div[contains(concat(' ',normalize-space(@class),' '),' article-text ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-container ')] + +author: //p[contains(concat(' ',normalize-space(@class),' '),' article-header-meta ')]//a[1] + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' article-header-title ')] + +strip_id_or_class: ad-container +strip_id_or_class: article-header-title +strip_id_or_class: article-action-label +strip_id_or_class: article-action-labels +strip_id_or_class: article-action-tooltip + +http_header(user-agent): Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0 + +test_url: https://www.contexte.com/article/pouvoirs/le-groupe-lrm-encaisse-les-coups-de-lopposition-dans-affaire-benalla_89898.html + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' sign-up-popin ')] +login_uri: https://www.contexte.com/login/ +login_username_field: username +login_password_field: password +login_extra_fields: next=/ +login_extra_fields: csrfmiddlewaretoken=@=xpath('//input[@name="csrfmiddlewaretoken"]', request_html('https://www.contexte.com/login/')) + diff --git a/data/GrabberConfig/contrepoints.org.txt b/data/GrabberConfig/contrepoints.org.txt new file mode 100644 index 00000000..8a6a1250 --- /dev/null +++ b/data/GrabberConfig/contrepoints.org.txt @@ -0,0 +1,21 @@ +# Contrepoints.org +# As of 2015-04, it's a wordpress-powered website. 
+ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' page-title ')]//span[contains(concat(' ',normalize-space(@class),' '),' inner-text ')] +date: //time[contains(concat(' ',normalize-space(@class),' '),' art-date ')] +author: //h1[contains(concat(' ',normalize-space(@class),' '),' author-name ')] +body: //article[contains(concat(' ',normalize-space(@class),' '),' plain-art ')] + +# no toolbar, meta, etc, but misses excerpt +# body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')] + +# Thus, we need to strip useless elements from the "plain-art" +strip: //div[contains(concat(' ',normalize-space(@class),' '),' plain-post-topbar ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' single-type-block ')] +strip: //header[contains(concat(' ',normalize-space(@class),' '),' entry-header ')] + +# And no pruning is needed because we stripped unwanted elements. +prune: no + +test_url: http://www.contrepoints.org/2015/04/25/205709-leconomie-selon-ray-dalio +test_url: http://www.contrepoints.org/2015/04/25/205734-huile-et-gaz-de-schiste-revolution-durable
\ No newline at end of file diff --git a/data/GrabberConfig/cookies.frankwatching.com.txt b/data/GrabberConfig/cookies.frankwatching.com.txt new file mode 100644 index 00000000..f50299b7 --- /dev/null +++ b/data/GrabberConfig/cookies.frankwatching.com.txt @@ -0,0 +1,6 @@ +single_page_link: //a[contains(., 'accepteer de cookies')] + +test_url: https://cookies.frankwatching.com/?redirect_to=/archive/2018/01/14/airbnb-stapt-met-hotels-in-de-markt-die-ze-zelf-verstoorde/ +test_contains: Je kon er op wachten + +test_url: https://www.frankwatching.com/feed/ diff --git a/data/GrabberConfig/cooper.com.txt b/data/GrabberConfig/cooper.com.txt new file mode 100644 index 00000000..fc156f7b --- /dev/null +++ b/data/GrabberConfig/cooper.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(@class,'post-body')] +date: //abbr[@class='published'] + +test_url: http://www.cooper.com/journal/2015/6/creating-personas diff --git a/data/GrabberConfig/core77.com.txt b/data/GrabberConfig/core77.com.txt new file mode 100644 index 00000000..cf1fa93c --- /dev/null +++ b/data/GrabberConfig/core77.com.txt @@ -0,0 +1,7 @@ +body: //div[@id="permalink"]/div[@class="post"] + +strip: //div[@id='backArrow'] +strip: //div[@id='fwdArrow'] +strip: //div[@class="post-title"] +strip: //div[@class="sharing"] +test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp
\ No newline at end of file diff --git a/data/GrabberConfig/counterpunch.org.txt b/data/GrabberConfig/counterpunch.org.txt new file mode 100644 index 00000000..12ca19f7 --- /dev/null +++ b/data/GrabberConfig/counterpunch.org.txt @@ -0,0 +1,7 @@ +title: //div[@class='main']//h1[contains(@class, 'article-title')] +author: //div[@class='mainauthorstyle'] +body: //div[@itemprop="articleBody"] +date: //meta[@itemprop="datePublished"]/@content +#strip: //td[@width='140'] + +test_url: http://www.counterpunch.org/2011/09/28/the-dangerous-cult-of-the-guardian/ diff --git a/data/GrabberConfig/courrierdesbalkans.fr.txt b/data/GrabberConfig/courrierdesbalkans.fr.txt new file mode 100644 index 00000000..c172152c --- /dev/null +++ b/data/GrabberConfig/courrierdesbalkans.fr.txt @@ -0,0 +1,22 @@ + +body://div[@id='content']//div[contains(concat(' ',normalize-space(@class),' '),' main ')] + +author: //div[@id='content']//span[@itemprop='author'] + +strip_id_or_class: barre-outils +strip_id_or_class: barre-sociale +strip_id_or_class: reserveabonnes + +test_url: https://www.courrierdesbalkans.fr/Confrontes-a-une-crise-de-main-d-oeuvre-les-employeurs-du-Banat-recrutent-en + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' formulaire_login ')] +login_uri: https://www.courrierdesbalkans.fr/ +login_username_field: var_login +login_password_field: password +login_extra_fields: formulaire_action=login +login_extra_fields: formulaire_action_args=@=xpath('//input[@name="formulaire_action_args"]', request_html('https://www.courrierdesbalkans.fr/')) + diff --git a/data/GrabberConfig/courrierdeuropecentrale.fr.txt b/data/GrabberConfig/courrierdeuropecentrale.fr.txt new file mode 100644 index 00000000..46d891b3 --- /dev/null +++ 
b/data/GrabberConfig/courrierdeuropecentrale.fr.txt @@ -0,0 +1,26 @@ + +body://div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +author: //header[contains(concat(' ',normalize-space(@class),' '),' entry-header ')]//a[@rel='author'] + +strip_id_or_class: abonnement +strip_id_or_class: authorpage +strip_id_or_class: article-tags +strip_id_or_class: entry-footer + +test_url: https://courrierdeuropecentrale.fr/prague-budapest-differences/ + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +#requires_login: yes +not_logged_in_xpath: //a[@href='https://courrierdeuropecentrale.fr/mon-compte/connexion/'] +login_uri: https://courrierdeuropecentrale.fr/mon-compte/connexion/ +login_username_field: rcp_user_login +login_password_field: rcp_user_pass +login_extra_fields: rcp_action=login +login_extra_fields: rcp_user_remember=1 +login_extra_fields: rcp_redirect=https://courrierdeuropecentrale.fr/mon-compte/connexion/ +login_extra_fields: rcp_login_nonce=@=xpath('//input[@name="rcp_login_nonce"]', request_html('https://courrierdeuropecentrale.fr/mon-compte/connexion/')) + +test_url: https://courrierdeuropecentrale.fr/la-slovaquie-attire-de-plus-en-plus-de-touristes/ diff --git a/data/GrabberConfig/courrierinternational.com.txt b/data/GrabberConfig/courrierinternational.com.txt new file mode 100644 index 00000000..abe61975 --- /dev/null +++ b/data/GrabberConfig/courrierinternational.com.txt @@ -0,0 +1,23 @@ +title: //h1 + +# We can have multiple authors +author: //div[@class='author-name-short'] + +# Publication date +date: //time[@itemprop='datePublished']/@datetime + +body: //div[@id='article-text'] + +prune: no + +requires_login: true +not_logged_in_xpath: //div[@class="box-reserved-abo"] + +login_uri: https://www.courrierinternational.com/login +login_username_field: name +login_password_field: pass 
+login_extra_fields: form_build_id=@=xpath('//form[@id="user-login-form"]//input[@name="form_build_id"]', request_html('http://www.courrierinternational.com/login')) +login_extra_fields: form_id=user_login_block +login_extra_fields: remember_me=1 + +test_url: http://www.courrierinternational.com/article/securite-informatique-cyberattaque-une-tempete-planetaire diff --git a/data/GrabberConfig/crimemagazine.com.txt b/data/GrabberConfig/crimemagazine.com.txt new file mode 100644 index 00000000..9cf0bccc --- /dev/null +++ b/data/GrabberConfig/crimemagazine.com.txt @@ -0,0 +1,2 @@ +autodetect_next_page: no +test_url: http://www.crimemagazine.com/son-sam
\ No newline at end of file diff --git a/data/GrabberConfig/crimethinc.com.txt b/data/GrabberConfig/crimethinc.com.txt new file mode 100644 index 00000000..b5a8018a --- /dev/null +++ b/data/GrabberConfig/crimethinc.com.txt @@ -0,0 +1,3 @@ +body: //div[@class="readingtext"] +title: substring-after(substring-after(//title, ':'), ':') +test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php
\ No newline at end of file diff --git a/data/GrabberConfig/crn.de.txt b/data/GrabberConfig/crn.de.txt new file mode 100644 index 00000000..61d5d6a7 --- /dev/null +++ b/data/GrabberConfig/crn.de.txt @@ -0,0 +1,3 @@ +author: //p[contains(@class,'author')]/a +date: //div[contains(@class,'date')] +test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html
\ No newline at end of file diff --git a/data/GrabberConfig/crunchyroll.com.txt b/data/GrabberConfig/crunchyroll.com.txt new file mode 100644 index 00000000..30c6a443 --- /dev/null +++ b/data/GrabberConfig/crunchyroll.com.txt @@ -0,0 +1,11 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.crunchyroll.com%2Fanime-feature%2F2017%2F03%2F30%2Ffeature-animegame-street-ads-in-akihabara-march-2017 + + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' crunchynews-header ')] +body: //div[@id='template_body'] +date: //span[contains(concat(' ',normalize-space(@class),' '),' post-date ')] +author: //div[contains(concat(' ',normalize-space(@class),' '),' byline ')] +strip: //ul[contains(concat(' ',normalize-space(@class),' '),' news-top-stories-row ')] +test_url: http://www.crunchyroll.com/anime-feature/2017/03/30/feature-animegame-street-ads-in-akihabara-march-2017
\ No newline at end of file diff --git a/data/GrabberConfig/csmonitor.com.txt b/data/GrabberConfig/csmonitor.com.txt new file mode 100644 index 00000000..70ab9885 --- /dev/null +++ b/data/GrabberConfig/csmonitor.com.txt @@ -0,0 +1,18 @@ +title: //h1[contains(@class, 'head')] + +# standard page +body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')] +# print page +body: //div[@id='mainColumn'] + +author: //a[contains(@class, 'ui-author')] + +single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')] + +strip_id_or_class: storyToolbar +strip_id_or_class: promotion-tag + +tidy: no +prune: no + +test_url: http://www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84 diff --git a/data/GrabberConfig/csnphilly.com.txt b/data/GrabberConfig/csnphilly.com.txt new file mode 100644 index 00000000..c14a934a --- /dev/null +++ b/data/GrabberConfig/csnphilly.com.txt @@ -0,0 +1,22 @@ +# author's name is not isolated as a tag.... ugh +convert_double_br_tags: yes +body: //csn_blogST_main + +#junk above and around the article +strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div +strip: /html/body/div[4]/header +strip_id_or_class: article-right-sidebar +strip_id_or_class: rsn-gigya-sharebar-container +strip_id_or_class: article-bottom +strip_id_or_class: hider +strip_id_or_class: footer +strip_id_or_class: masthead +strip_id_or_class: block-menu-menu-rsn-login-or-register +strip_id_or_class: block-menu-menu-header-links +strip_id_or_class: block-rsn-follow-bar-follow-bar +strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard +strip_id_or_class: logo +strip_id_or_class: element-invisible +strip_id_or_class: site-name +strip: //div[contains(@style, 'none')] +test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career
\ No newline at end of file diff --git a/data/GrabberConfig/csoonline.com.txt b/data/GrabberConfig/csoonline.com.txt new file mode 100644 index 00000000..1ae24b90 --- /dev/null +++ b/data/GrabberConfig/csoonline.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.csoonline.com/article/3168357/security/windows-trojan-hacks-into-embedded-devices-to-install-mirai.html diff --git a/data/GrabberConfig/css-tricks.com.txt b/data/GrabberConfig/css-tricks.com.txt new file mode 100644 index 00000000..16b4bd78 --- /dev/null +++ b/data/GrabberConfig/css-tricks.com.txt @@ -0,0 +1,5 @@ +title: //h1 +author: //a[contains(@href, "css-tricks.com/author")] +body: //div[@class="article-content"] +strip: //div[contains(@class, "sharedaddy")] +test_url: https://css-tricks.com/using-custom-properties-modify-components/ diff --git a/data/GrabberConfig/cucharasonica.com.txt b/data/GrabberConfig/cucharasonica.com.txt new file mode 100644 index 00000000..e691fe83 --- /dev/null +++ b/data/GrabberConfig/cucharasonica.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://cucharasonica.com/2011/09/queen-busca-candidatos-para-su-propia-banda-tributo
\ No newline at end of file diff --git a/data/GrabberConfig/cultofmac.com.txt b/data/GrabberConfig/cultofmac.com.txt new file mode 100644 index 00000000..ea812c12 --- /dev/null +++ b/data/GrabberConfig/cultofmac.com.txt @@ -0,0 +1,8 @@ +strip_id_or_class: tags +strip_id_or_class: post-more +strip_id_or_class: sidebar +strip_id_or_class: social-container + +author: //a[@rel='author'] + +test_url: https://www.cultofmac.com/568215/eu-may-force-iphone-to-switch-from-lightning-to-usb/ diff --git a/data/GrabberConfig/culturebd.com.txt b/data/GrabberConfig/culturebd.com.txt new file mode 100644 index 00000000..da6582cf --- /dev/null +++ b/data/GrabberConfig/culturebd.com.txt @@ -0,0 +1,41 @@ + +author: //div[@itemprop='author']//p[@itemprop='name'] + +body: //section[contains(concat(' ',normalize-space(@class),' '),' content ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' content ')] + +prune: no + +strip_id_or_class: social +strip_id_or_class: follow +strip_id_or_class: page-up +strip_id_or_class: see-more +strip_id_or_class: author-box +strip_id_or_class: mobileHide +strip_id_or_class: h-alb-titre +strip_id_or_class: listComments +strip_id_or_class: share-buttons +strip_id_or_class: comment-form-bottom +strip_id_or_class: biblio-action-buttons +strip: //h2[text()='Pour aller plus loin']/following-sibling::div[contains(concat(' ',normalize-space(@class),' '),' panel ')] +strip: //h2[text()='Pour aller plus loin'] +strip: //h2[@id="cbdReview"] +strip: //nav + +# book series: +test_url: https://culturebd.com/serie-17441-dept.-h + +# book review: +test_url: https://culturebd.com/album-bd/les-enquetes-du-commissaire-raffini/230936-12-rue-des-souvenirs + +# interview: +test_url: https://culturebd.com/actualites/interviews/2018/07/331-wesh-caribou-l-art-du-rire-par-grand-froid + +# anecdote: +test_url: https://culturebd.com/anecdotes/the-end-l-apocalypse-selon-les-doors-255 + +# author: +test_url: https://culturebd.com/auteurs/110-rodolphe + +# agenda: 
+test_url: https://culturebd.com/agenda/exposition-chapeau-bas-spirou-676 diff --git a/data/GrabberConfig/cw.com.tw.txt b/data/GrabberConfig/cw.com.tw.txt new file mode 100644 index 00000000..6e3a91ee --- /dev/null +++ b/data/GrabberConfig/cw.com.tw.txt @@ -0,0 +1,14 @@ +author://span[contains(@class,'reporter')] + +date://span[contains(@class,'date')] + +body://div[contains(@class,'mainContaner')] + +strip://div[contains(@class,'mainHeaer')] +strip://div[contains(@class,'keyW')] +strip://div[contains(@class,'wonderful')] +strip://div[contains(@class,'pages')] +strip://div[contains(@class,'Topics TopicsW3')] + +next_page_link://li[@class='pageNext']/a[contains(.,'下一頁')] +test_url: http://www.cw.com.tw/article/article.action?id=5032848
\ No newline at end of file diff --git a/data/GrabberConfig/cwnp.com.txt b/data/GrabberConfig/cwnp.com.txt new file mode 100644 index 00000000..169fdf84 --- /dev/null +++ b/data/GrabberConfig/cwnp.com.txt @@ -0,0 +1,14 @@ +title: //div[@class='entry-pad']//h2 +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-pad ')] +strip: //h1 +strip: //p +strip: //h2 +strip: //div[@class='clear'] + +prune: no +tidy: no + +autodetect_on_failure: no + +test_url: https://www.cwnp.com/wotd.php +test_url: https://www.cwnp.com/qotd.php diff --git a/data/GrabberConfig/cyrille-borne.com.txt b/data/GrabberConfig/cyrille-borne.com.txt new file mode 100644 index 00000000..c40b1778 --- /dev/null +++ b/data/GrabberConfig/cyrille-borne.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fcyrille-borne.com%2Fpluxml%2Findex.php%3Farticle144%2Fpourquoi-linux + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')] +test_url: https://cyrille-borne.com/pluxml/index.php?article144/pourquoi-linux
\ No newline at end of file diff --git a/data/GrabberConfig/da.feedsportal.com.txt b/data/GrabberConfig/da.feedsportal.com.txt new file mode 100644 index 00000000..2bd66be8 --- /dev/null +++ b/data/GrabberConfig/da.feedsportal.com.txt @@ -0,0 +1,5 @@ +single_page_link: //a +tidy: no +prune: no + +test_url: http://da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm diff --git a/data/GrabberConfig/dadall.info.txt b/data/GrabberConfig/dadall.info.txt new file mode 100644 index 00000000..d02b7d13 --- /dev/null +++ b/data/GrabberConfig/dadall.info.txt @@ -0,0 +1,6 @@ +title: //article//header//h1 +author: //meta[@name="author"]/@content +date: //time//@datetime + +body: //article//section +test_url: https://www.dadall.info/article611/message-de-service diff --git a/data/GrabberConfig/dagogtid.no.txt b/data/GrabberConfig/dagogtid.no.txt new file mode 100644 index 00000000..1531472c --- /dev/null +++ b/data/GrabberConfig/dagogtid.no.txt @@ -0,0 +1,4 @@ +title: //span[@class = 'overskriftEkstrastor'] +author: //em/a + +test_url: http://dagogtid.no/nyhet.cfm?nyhetid=2414
\ No newline at end of file diff --git a/data/GrabberConfig/dailydot.com.txt b/data/GrabberConfig/dailydot.com.txt new file mode 100644 index 00000000..978ed1ce --- /dev/null +++ b/data/GrabberConfig/dailydot.com.txt @@ -0,0 +1,4 @@ +tidy: no +body: //article + +test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/
\ No newline at end of file diff --git a/data/GrabberConfig/dailykos.com.txt b/data/GrabberConfig/dailykos.com.txt new file mode 100644 index 00000000..6d4cb82a --- /dev/null +++ b/data/GrabberConfig/dailykos.com.txt @@ -0,0 +1,10 @@ +body: //div[@id='article-1']//div[contains(@class, 'article-body')] +title: //div[@class='meta']//a[@id='titleHref'] +date: //div[@class='meta']//p[@class='date'] + +strip_id_or_class: invisible +strip_id_or_class: divider-doodle + +prune: no + +test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrich-s-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his-ex-wife diff --git a/data/GrabberConfig/dailymail.co.uk.txt b/data/GrabberConfig/dailymail.co.uk.txt new file mode 100644 index 00000000..8535b19f --- /dev/null +++ b/data/GrabberConfig/dailymail.co.uk.txt @@ -0,0 +1,21 @@ +body: //div[@id='js-article-text'] +strip: //div[@class='explore-links'] +strip: //div[@id='js-article-text']/br[position()=1] +strip_id_or_class: print-or-mail-links +strip_id_or_class: shareArticles +strip_id_or_class: googleAds +strip_id_or_class: digg-button +strip_id_or_class: article-icon-links-container +strip_id_or_class: clickToEnlarge +strip_id_or_class: articleIconLinksContainer +strip_id_or_class: related-carousel +strip_id_or_class: reader-comments +strip_id_or_class: most-watched +strip_id_or_class: most-read + +find_string:blkBorder img-share +replace_string: nothing + +tidy: no + +test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html
\ No newline at end of file diff --git a/data/GrabberConfig/dailymotion.com.txt b/data/GrabberConfig/dailymotion.com.txt new file mode 100644 index 00000000..6253325f --- /dev/null +++ b/data/GrabberConfig/dailymotion.com.txt @@ -0,0 +1,12 @@ +title: //title +body: //iframe + +replace_string(<![CDATA[): _ +replace_string(]]>): _ + +single_page_link: //link[@type='application/xml+oembed'] + +prune: no +tidy: no + +test_url: http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people diff --git a/data/GrabberConfig/dailynord.fr.txt b/data/GrabberConfig/dailynord.fr.txt new file mode 100644 index 00000000..900564d3 --- /dev/null +++ b/data/GrabberConfig/dailynord.fr.txt @@ -0,0 +1,37 @@ + +author: //a[@rel='author'] + +title: //section[@id='pagearticle']/h1[1] + +body: //section[@id='pagearticle'] + +strip: //*[@id='comments']/following-sibling::* +strip: //*[@id='comments']/following-sibling::text() +strip: //*[@id='respond']/following-sibling::* +strip: //*[@id='respond']/following-sibling::text() +strip: //*[@id='comments'] +strip: //*[@id='respond'] +strip_id_or_class: toptitlepages +strip_id_or_class: rcp_paid_only +strip_id_or_class: rcp_restricted +strip_id_or_class: relpost-thumb-wrapper +strip_id_or_class: really_simple_share +strip_id_or_class: copyright +strip_id_or_class: reply +strip_id_or_class: cancel-comment-reply +strip_id_or_class: commentform + +test_url: https://dailynord.fr/2018/07/lextraordinaire-oeuvre-de-sergex-ne-a-mericourt/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //form[@id='rcp_login_form'] +login_uri: https://dailynord.fr/connexion-a-mon-espace-abonne/ +login_username_field: rcp_user_login +login_password_field: rcp_user_pass +login_extra_fields: 
rcp_login_nonce=@=xpath('//form[@id="rcp_login_form"]//input[@name="rcp_login_nonce"]', request_html('https://dailynord.fr/connexion-a-mon-espace-abonne/')) +login_extra_fields: rcp_user_remember=1 +login_extra_fields: rcp_action=login +login_extra_fields: rcp_redirect=https://dailynord.fr/connexion-a-mon-espace-abonne/ diff --git a/data/GrabberConfig/dailysabah.com.txt b/data/GrabberConfig/dailysabah.com.txt new file mode 100644 index 00000000..67b108b6 --- /dev/null +++ b/data/GrabberConfig/dailysabah.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.dailysabah.com%2Feurope%2F2015%2F11%2F09%2Fcatalan-parliament-votes-to-secede-from-spain-by-2017 + +body: //div[@id='newsDescription']//div[contains(concat(' ',normalize-space(@class),' '),' txt ')] +test_url: http://www.dailysabah.com/europe/2015/11/09/catalan-parliament-votes-to-secede-from-spain-by-2017 diff --git a/data/GrabberConfig/dailystar.com.lb.txt b/data/GrabberConfig/dailystar.com.lb.txt new file mode 100644 index 00000000..3b153042 --- /dev/null +++ b/data/GrabberConfig/dailystar.com.lb.txt @@ -0,0 +1,6 @@ +title: //div[@class='ec-blog-headline'] +body: //*[@id="divDetails"] +date: //*[@id="ctl00_ContentPlaceHolder1_tdDate"] +author: //*[@id="ctl00_ContentPlaceHolder1_anchorAuthor"]/a +autodetect_next_page: no +test_url: http://dailystar.com.lb/Opinion/Columnist/2012/Oct-10/190803-americas-new-modesty-in-the-mideast.ashx#axzz2928JP5xE
\ No newline at end of file diff --git a/data/GrabberConfig/dansdata.com.txt b/data/GrabberConfig/dansdata.com.txt new file mode 100644 index 00000000..60669480 --- /dev/null +++ b/data/GrabberConfig/dansdata.com.txt @@ -0,0 +1,5 @@ +autodetect_next_page: no +tidy: no +prune: no +body: //div[@class='NoOverflow'] +test_url: http://www.dansdata.com/gz129.htm
\ No newline at end of file diff --git a/data/GrabberConfig/dantri.com.vn.txt b/data/GrabberConfig/dantri.com.vn.txt new file mode 100644 index 00000000..f19fee7c --- /dev/null +++ b/data/GrabberConfig/dantri.com.vn.txt @@ -0,0 +1,7 @@ +title: //h1[contains(@class, 'fon31 mt2')] +body: //h2[contains(@class, 'fon33 mt1')] | //div[contains(@class, 'fon34 mt3')] + +prune: no + +test_url: http://dantri.com.vn/su-kien/chang-trai-mot-minh-dap-xe-vuot-450km-de-vieng-mo-dai-tuong-869763.htm +test_url: http://dantri.com.vn/trangchu.rss
\ No newline at end of file diff --git a/data/GrabberConfig/daringfireball.net.txt b/data/GrabberConfig/daringfireball.net.txt new file mode 100644 index 00000000..251cc670 --- /dev/null +++ b/data/GrabberConfig/daringfireball.net.txt @@ -0,0 +1,7 @@ +title: //div[@class="article"]/h1 +author: //div[@id="Sidebar"]/p/strong +date: //h6[@class="dateline"] +body: //div[@class="article"] +strip: //h6[@class="dateline"] +strip: //div[@class="article"]/h1 +test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels
\ No newline at end of file diff --git a/data/GrabberConfig/daserste.ndr.de.txt b/data/GrabberConfig/daserste.ndr.de.txt new file mode 100644 index 00000000..1ad3056d --- /dev/null +++ b/data/GrabberConfig/daserste.ndr.de.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fdaserste.ndr.de%2Fpanorama%2Faktuell%2FChronik-Rechtsextreme-Vorfaelle-in-der-AfD-2016%2Cafd892.html + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' headline ')] +next_page_link: //li[@class='next']/a +body: //div[contains(concat(' ',normalize-space(@class),' '),' modCon ')]//div[contains(concat(' ',normalize-space(@class),' '),' mod ') and (contains(concat(' ',normalize-space(@class),' '),' modA ')) and (contains(concat(' ',normalize-space(@class),' '),' modParagraph '))]//div[contains(concat(' ',normalize-space(@class),' '),' boxCon ')]//div[contains(concat(' ',normalize-space(@class),' '),' box ')] +test_url: http://daserste.ndr.de/panorama/aktuell/Chronik-Rechtsextreme-Vorfaelle-in-der-AfD-2016,afd892.html diff --git a/data/GrabberConfig/dbazi.com.txt b/data/GrabberConfig/dbazi.com.txt new file mode 100644 index 00000000..c17adcd0 --- /dev/null +++ b/data/GrabberConfig/dbazi.com.txt @@ -0,0 +1,3 @@ +body: //div[contains(@class, 'the-content') or contains(@class, 'featured-image-inner')] + +test_url: http://www.dbazi.com/1395/12/07/%D8%A8%D8%A7%D8%B2%DB%8C%E2%80%8C%D9%87%D8%A7%DB%8C-%D8%A2%DB%8C%D9%86%D8%AF%D9%87%E2%80%8C%DB%8C-%D9%85%D8%A7%D8%B1%D9%88%D9%84-%D9%88%D8%A7%D8%A8%D8%B3%D8%AA%D9%87-%D8%A8%D9%87-%DA%A9%D8%AA%D8%A7/ diff --git a/data/GrabberConfig/dcurt.is.txt b/data/GrabberConfig/dcurt.is.txt new file mode 100644 index 00000000..524c4bf1 --- /dev/null +++ b/data/GrabberConfig/dcurt.is.txt @@ -0,0 +1,8 @@ +title: (//article//h2)[1] +body: //article[contains(@class, 'post')] +date: 
//time[@id='top_time']/@datetime + +prune: no +tidy: no + +test_url: http://dcurt.is/predictions-txt
\ No newline at end of file diff --git a/data/GrabberConfig/deadline.com.txt b/data/GrabberConfig/deadline.com.txt new file mode 100644 index 00000000..782bdeea --- /dev/null +++ b/data/GrabberConfig/deadline.com.txt @@ -0,0 +1,11 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fdeadline.com%2F2016%2F03%2Fsteven-spielberg-harrison-ford-team-up-for-indiana-jones-5-disney-sets-july-2019-release-1201720725%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')] +strip_id_or_class: pad +strip_id_or_class: recent-comments +strip_id_or_class: tags +strip_id_or_class: copyrighta + +test_url: http://deadline.com/2016/03/steven-spielberg-harrison-ford-team-up-for-indiana-jones-5-disney-sets-july-2019-release-1201720725/ diff --git a/data/GrabberConfig/deadspin.com.txt b/data/GrabberConfig/deadspin.com.txt new file mode 100644 index 00000000..190b3c20 --- /dev/null +++ b/data/GrabberConfig/deadspin.com.txt @@ -0,0 +1,4 @@ +http_header(user-agent): PHP/5.3 + +test_url: http://deadspin.com/actually-extra-innings-should-stay-just-as-they-are-1792189890 +test_contains: Imagine watching the last postseason diff --git a/data/GrabberConfig/deia.com.txt b/data/GrabberConfig/deia.com.txt new file mode 100644 index 00000000..d2f78239 --- /dev/null +++ b/data/GrabberConfig/deia.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='widget full_article'] +strip: //div[@class='Herramientas'] + +test_url: http://www.deia.com/2015/10/03/politica/euskadi/el-envilecimiento-de-la-politica-y-de-los-medios-de-comunicacion diff --git a/data/GrabberConfig/delong.typepad.com.txt b/data/GrabberConfig/delong.typepad.com.txt new file mode 100644 index 00000000..c4b922e4 --- /dev/null +++ b/data/GrabberConfig/delong.typepad.com.txt @@ -0,0 +1,4 @@ +strip_id_or_class: banner +strip_id_or_class: gamma +strip_id_or_class: module-list +test_url: 
http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html
\ No newline at end of file diff --git a/data/GrabberConfig/democracynow.org.txt b/data/GrabberConfig/democracynow.org.txt new file mode 100644 index 00000000..b0050b4f --- /dev/null +++ b/data/GrabberConfig/democracynow.org.txt @@ -0,0 +1,5 @@ +body: //div[contains(@class, 'blog_body')] + +prune: no + +test_url: http://www.democracynow.org/blog/2014/1/9/the_fbi_the_nsa_and_a
\ No newline at end of file diff --git a/data/GrabberConfig/denikn.cz.txt b/data/GrabberConfig/denikn.cz.txt new file mode 100644 index 00000000..2ecd6c01 --- /dev/null +++ b/data/GrabberConfig/denikn.cz.txt @@ -0,0 +1,16 @@ +title: //h2[contains(concat(' ',normalize-space(@class),' '),' e_title ')] +author: //cite[contains(concat(' ',normalize-space(@class),' '),' e_author_t ')] +date: //time[@class='e_terms_posted'] +strip: //article[contains(concat(' ',normalize-space(@class),' '),' a_art__link ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' t_thankyou ')]//p +body: //div[contains(concat(' ',normalize-space(@class),' '),' b_single_e ')]//p | //div[contains(concat(' ',normalize-space(@class),' '),' a_single__post ')] + +requires_login: yes + +login_uri: https://predplatne.denikn.cz/api/v1/users/login/ +login_username_field: email +login_password_field: password + +not_logged_in_xpath: //form[@class="s_h_usr_login"] + +test_url: https://denikn.cz/3186/jsem-vnitrne-vyhorely-mam-odpracovano-cunek-je-stary-provokater-rika-belobradek/ diff --git a/data/GrabberConfig/der-postillon.com.txt b/data/GrabberConfig/der-postillon.com.txt new file mode 100644 index 00000000..a53f1b38 --- /dev/null +++ b/data/GrabberConfig/der-postillon.com.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.der-postillon.com%2F2013%2F03%2Fpornodarsteller-arbeitet-ein-jahr-lang.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-body ')] + +test_url: http://www.der-postillon.com/2013/03/pornodarsteller-arbeitet-ein-jahr-lang.html diff --git a/data/GrabberConfig/derbund.ch.txt b/data/GrabberConfig/derbund.ch.txt new file mode 100644 index 00000000..1363eff6 --- /dev/null +++ b/data/GrabberConfig/derbund.ch.txt @@ -0,0 +1,13 @@ +# Author: cirnod@gmail.com + +tidy: no +prune: no + +body: 
//div[@id="article"]/h3 | //*[@id="mainContent"] + +# General Cleanup +#strip_id_or_class: info_panel + + +# Try yourself +test_url: http://www.derbund.ch/bern/nachrichten/Fossilienforscher-stehen-auf-Heavy-Metal/story/20919522 diff --git a/data/GrabberConfig/derstandard.at.txt b/data/GrabberConfig/derstandard.at.txt new file mode 100644 index 00000000..4f915919 --- /dev/null +++ b/data/GrabberConfig/derstandard.at.txt @@ -0,0 +1,29 @@ +title: //div[@id='content-header']/h1 +author: //span[@class='author'] +body: //div[@id='objectContent'] +strip: //ul[@class='lookupLinksArtikel'] + +strip: //meta +strip: //div[@itemprop='publisher'] +strip: //div[@id='content-header'] +strip: //div[@id='pageTop'] +strip: //div[@id='toolbar'] +strip: //div[@id='articleTools'] +strip: //div[@id='weiterLesen'] +strip: //div[@id='communityCanvas'] +strip: //div[@class='credits'] +strip: //div[@id='feature-cover'] +strip: //div[@id='feature-meta'] +strip: //li[@class='empty'] +strip: //ul[@class='lookup-links'] + +http_header(Cookie): DSGVO_ZUSAGE_V1=true + +prune: no +tidy: no + +test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation +test_url: http://derstandard.at/2000033602592/40-Jahre-fuer-einen-der-nicht-gemeinsam-leben-wollte +test_url: http://derstandard.at/2000017141382/Feature-Format +test_url: http://derstandard.at/2000031826169/Die-Mittagspause-ist-den-meisten-heilig +test_url: http://derstandard.at/2000032076109/Fordern-wir-die-totale-Ueberwachung diff --git a/data/GrabberConfig/des-livres-pour-changer-de-vie.fr.txt b/data/GrabberConfig/des-livres-pour-changer-de-vie.fr.txt new file mode 100644 index 00000000..56b0ea27 --- /dev/null +++ b/data/GrabberConfig/des-livres-pour-changer-de-vie.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: 
http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.des-livres-pour-changer-de-vie.fr%2Fbit-literacy-2%2F + +body: //div[@id='getsocialmain'] +test_url: http://www.des-livres-pour-changer-de-vie.fr/bit-literacy-2/
\ No newline at end of file diff --git a/data/GrabberConfig/designsponge.com.txt b/data/GrabberConfig/designsponge.com.txt new file mode 100644 index 00000000..0fd69b1b --- /dev/null +++ b/data/GrabberConfig/designsponge.com.txt @@ -0,0 +1,31 @@ +# Author: zinnober + +tidy: no +prune: no + +# Set title +title: //header/h1 + +# Set author +author: //a[@rel='author'] + +# Content is here +body: //article + +# Tidy up before article +strip: //header + +# Tidy up article +strip: //div[contains(@id, 'gallery-')] +replace_string(<a rel="attachment): <p rel="attachment + + +# Tidy up after article +strip: //div[@class='sm'] +strip_id_or_class: related +strip_id_or_class: comments +strip: //footer + +# Try it yourself +test_url: http://www.designsponge.com/2010/06/seattle-design-guide.html +test_url: http://www.designsponge.com/2012/04/sneak-peek-liz-cook.html diff --git a/data/GrabberConfig/designtagebuch.de.txt b/data/GrabberConfig/designtagebuch.de.txt new file mode 100644 index 00000000..9020847f --- /dev/null +++ b/data/GrabberConfig/designtagebuch.de.txt @@ -0,0 +1,11 @@ +tidy: no +body: //div[@class='main'] + +author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am') +date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ') + +strip_id_or_class: pagelink +strip_id_or_class: wp-polls + +next_page_link: //div[@class='post-page-next']/a +test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/
\ No newline at end of file diff --git a/data/GrabberConfig/deutsche-apotheker-zeitung.de.txt b/data/GrabberConfig/deutsche-apotheker-zeitung.de.txt new file mode 100644 index 00000000..36709cab --- /dev/null +++ b/data/GrabberConfig/deutsche-apotheker-zeitung.de.txt @@ -0,0 +1,29 @@ +# Author: zinnober + +prune: yes +tidy: yes + +title: //h1 +date: //p[@class='news_datum'] +author: //span[@class='author'] + +body: //div[@class='tagesnews-content'] + +# General cleanup +strip_id_or_class: dachzeile +strip: //h3 +strip: //p[@class='bodytext']//a +strip_id_or_class: autor_datum +strip_id_or_class: comments +strip_id_or_class: banner- + +strip: //p[contains(., 'Lesen Sie')] +strip: //p[contains(., '– in DAZ')] + +# Fix image captions +replace_string(<p class="image_caption">): <p><small><em> +replace_string(</dd>): </em></small></dd> + +test_url: http://www.deutsche-apotheker-zeitung.de/pharmazie/news/2014/09/03/weniger-nebenwirkungen-aber-kein-zusatznutzen/13715.html +test_url: http://www.deutsche-apotheker-zeitung.de/recht/news/2014/09/02/urteile-zum-cannabis-eigenanbau-bfarm-geht-in-berufung/13716.html + diff --git a/data/GrabberConfig/developers.facebook.com.txt b/data/GrabberConfig/developers.facebook.com.txt new file mode 100644 index 00000000..7609b72f --- /dev/null +++ b/data/GrabberConfig/developers.facebook.com.txt @@ -0,0 +1,3 @@ +title: //div[@class="bodyText"]/h1 +author: //div[@class="picture"]/a/img/@alt +test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/
\ No newline at end of file diff --git a/data/GrabberConfig/devlinsangle.blogspot.co.at.txt b/data/GrabberConfig/devlinsangle.blogspot.co.at.txt new file mode 100644 index 00000000..6f1d4e27 --- /dev/null +++ b/data/GrabberConfig/devlinsangle.blogspot.co.at.txt @@ -0,0 +1,6 @@ +date: //h2[@class='date-header'] +body: //div[@class='post hentry'] +title: //h3 +strip: //div[@class='post-footer'] + +test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html
\ No newline at end of file diff --git a/data/GrabberConfig/diagonalperiodico.net.txt b/data/GrabberConfig/diagonalperiodico.net.txt new file mode 100644 index 00000000..f0681bac --- /dev/null +++ b/data/GrabberConfig/diagonalperiodico.net.txt @@ -0,0 +1,4 @@ +body: //div[@class='field__items'] +title: //div[@class='art_titulo'] + +test_url: https://www.diagonalperiodico.net/global/27947-cuanto-mas-tiempo-nos-aferremos-este-sistema-peor-y-menores-seran-nuestras-opciones diff --git a/data/GrabberConfig/dictionary.reference.com.txt b/data/GrabberConfig/dictionary.reference.com.txt new file mode 100644 index 00000000..b8243d0c --- /dev/null +++ b/data/GrabberConfig/dictionary.reference.com.txt @@ -0,0 +1,6 @@ +body: //div[contains(@class, 'source-data')] +strip: //button + +prune: no + +test_url: http://dictionary.reference.com/browse/propaganda diff --git a/data/GrabberConfig/diepresse.com.txt b/data/GrabberConfig/diepresse.com.txt new file mode 100644 index 00000000..ced189cc --- /dev/null +++ b/data/GrabberConfig/diepresse.com.txt @@ -0,0 +1,6 @@ +title: //div[@class='article']/h1 +date: substring-before(//p[@class='articletime'],'|') +body: //div[@id='articletext'] +strip: //div[@class='inlineDiashow'] + +test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do
\ No newline at end of file diff --git a/data/GrabberConfig/digiphoto.techbang.com.txt b/data/GrabberConfig/digiphoto.techbang.com.txt new file mode 100644 index 00000000..80ce5ff3 --- /dev/null +++ b/data/GrabberConfig/digiphoto.techbang.com.txt @@ -0,0 +1,8 @@ +# default parser works great +# only add "author" and "next page link" reference +# 2012-04-13 + +next_page_link: //div[@class = 'pagination']/a[@class = 'next_page'] + +author: //*[@class = 'author metadata']/a +test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work
\ No newline at end of file diff --git a/data/GrabberConfig/digital-photography-school.com.txt b/data/GrabberConfig/digital-photography-school.com.txt new file mode 100644 index 00000000..18ce370e --- /dev/null +++ b/data/GrabberConfig/digital-photography-school.com.txt @@ -0,0 +1,6 @@ +title: //div[@class='post-title']/h1 +author: //a[@href='#author'] +body: //div[@class='post-content'] +strip: //div[@class='post-meta'] + +test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically
\ No newline at end of file diff --git a/data/GrabberConfig/digitalforensics.com.txt b/data/GrabberConfig/digitalforensics.com.txt new file mode 100644 index 00000000..7fc31c4e --- /dev/null +++ b/data/GrabberConfig/digitalforensics.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.digitalforensics.com%2Fblog%2Fextracting-whatsapp-database-and-the-cipher-key-from-a-non-rooted-android-device%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-right ')] +test_url: https://www.digitalforensics.com/blog/extracting-whatsapp-database-and-the-cipher-key-from-a-non-rooted-android-device/
\ No newline at end of file diff --git a/data/GrabberConfig/digitalspy.co.uk.txt b/data/GrabberConfig/digitalspy.co.uk.txt new file mode 100644 index 00000000..f48bdfdb --- /dev/null +++ b/data/GrabberConfig/digitalspy.co.uk.txt @@ -0,0 +1,5 @@ +title: //div[@class="article_header"]/h1 +date: //div[@class="article_pub"]/span[@class="time"] +author: //div[@class="article_pub"]/span[@class="editors"]/a/text() +body: //div[@class="article_body clear_left"] +test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html
\ No newline at end of file diff --git a/data/GrabberConfig/dilbert.com.txt b/data/GrabberConfig/dilbert.com.txt new file mode 100644 index 00000000..b8788553 --- /dev/null +++ b/data/GrabberConfig/dilbert.com.txt @@ -0,0 +1,9 @@ +title: //a[@class="post-title"]/text() +title: //meta[@name="twitter:title"]/@content +body: //img[@class="img-responsive img-comic"] +author: string('Scott Adams') +date: //meta[@property="article:publish_date"]/@content + +test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/ +test_url: http://dilbert.com/strips/comic/2013-10-22 +test_url: http://feed.dilbert.com/dilbert/daily_strip diff --git a/data/GrabberConfig/dinamalar.com.txt b/data/GrabberConfig/dinamalar.com.txt new file mode 100644 index 00000000..bc315cf1 --- /dev/null +++ b/data/GrabberConfig/dinamalar.com.txt @@ -0,0 +1,19 @@ +title: //div[@class='newsdetbd'] +body: //div[@id='innerleft'] +#//p[@class = 'plnht'] +strip_image_src: /albums/ +strip: //div[@class='mrrt'] +prune: yes +strip_id_or_class: 'fdpd' +strip_id_or_class: 'epapt' +strip_id_or_class: 'newsrtwd' +strip_id_or_class: 'padtp' +strip_id_or_class: 'newdt' +strip_id_or_class: 'newdlt' +strip: //div[@id='selNotes'] +strip_id_or_class: 'clsNotes' +strip_id_or_class: 'clear' +strip_id_or_class: 'cmtwrap' +strip_id_or_class: 'sess' +strip_id_or_class: 'parents' +test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725
\ No newline at end of file diff --git a/data/GrabberConfig/distributistreview.com.txt b/data/GrabberConfig/distributistreview.com.txt new file mode 100644 index 00000000..9fe9e44a --- /dev/null +++ b/data/GrabberConfig/distributistreview.com.txt @@ -0,0 +1,8 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' post_content_wrapper ')] +strip_id_or_class: post_share_text +strip_id_or_class: post_info_cat +strip_id_or_class: about_the_author +strip_id_or_class: widgettitle +strip_id_or_class: post_related + +test_url: http://distributistreview.com/distributism-economics-as-if-people-mattered/ diff --git a/data/GrabberConfig/dn.pt.txt b/data/GrabberConfig/dn.pt.txt new file mode 100644 index 00000000..051b8cb9 --- /dev/null +++ b/data/GrabberConfig/dn.pt.txt @@ -0,0 +1,9 @@ +single_page_link: concat('http://www.dn.pt/Common/print.aspx?content_id=', //input[@type='hidden' and @name='link-comments']/@value) +#<input type="hidden" name="link-comments" class="link-comments" value="3972244"> + +title: //h1 +author: //div[@class="Author"] + +strip: //div[@class="Patrocinio"] + +test_url: http://www.dn.pt/inicio/opiniao/interior.aspx?content_id=3972244&seccao=Alberto%20Gon%E7alves&tag=Opini%E3o%20-%20Em%20Foco&page=1
\ No newline at end of file diff --git a/data/GrabberConfig/dn.se.txt b/data/GrabberConfig/dn.se.txt new file mode 100644 index 00000000..9584abad --- /dev/null +++ b/data/GrabberConfig/dn.se.txt @@ -0,0 +1,42 @@ +title: //h1[contains(@class, 'article__headline')] + +body: //div[contains(@class, 'article__body-content') or contains(@class, 'article__lead') or contains(@class, 'image-box__container')] +body: //div[@id="article-content"] + + +# Ads +strip_id_or_class: advert-space + +# Read more, recommend, comments etc +strip_id_or_class: fbc-recommend +strip_id_or_class: recommend +strip_id_or_class: article-readers +strip_id_or_class: article-addons +strip_id_or_class: hook +#strip_id_or_class: right +strip_id_or_class: footer + +strip_id_or_class: ad-head +strip_id_or_class: atc-share-title + +# Other news +strip: //div[@id="mirrors"] + +# Author +author: //div[@id="byline"]/div/p/strong + +find_string: <noscript> +replace_string: <!-- removed --> +find_string: </noscript> +replace_string: <!-- removed --> + +#parser: html5php +prune: no +tidy: no + +# Date +date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11) + +test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade +test_contains: Ett tekniskt haveri tvingade +test_url: http://www.dn.se/rss/senaste-nytt diff --git a/data/GrabberConfig/dobreprogramy.pl.txt b/data/GrabberConfig/dobreprogramy.pl.txt new file mode 100644 index 00000000..972293bc --- /dev/null +++ b/data/GrabberConfig/dobreprogramy.pl.txt @@ -0,0 +1,6 @@ +title: //*[@class="news"]//h1[@class="title"] +author: //*[@class="news"]//*[@class="newsInfo"]/a +date: substring-before(//*[@class="news"]//*[@class="newsInfo"]/text(), ',') +body: //*[@class="news"]//*[@class="newsContent"] +footnotes: no +test_url: http://www.dobreprogramy.pl/Sony-konczy-z-Foldinghome-na-PS3,Aktualnosc,36899.html
\ No newline at end of file diff --git a/data/GrabberConfig/domusweb.it.txt b/data/GrabberConfig/domusweb.it.txt new file mode 100644 index 00000000..20566ee3 --- /dev/null +++ b/data/GrabberConfig/domusweb.it.txt @@ -0,0 +1,21 @@ +# TODO: clean up the extra junk at the end of articles + +# general text formatting +prune: no +convert_double_br_tags:yes + +# where to find the basic metadata +author://a[@class='articleauthor'] +date://a[starts-with(@href,'/en/search/published/')] +title:substring-before(//h2[@class='title'],'—') +body://div[@id='maincontainer'] + +dissolve://div[starts-with(@id,'commentableblock')] + +# clean up the crap +strip://div[contains(@class,'domusnetwork')] +strip://div[contains(@class,'relative_wrapper')] + +strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] +wrap_in(em): //div[contains(@class,'captionsubimage')]/span +test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/
\ No newline at end of file diff --git a/data/GrabberConfig/dorkly.com.txt b/data/GrabberConfig/dorkly.com.txt new file mode 100644 index 00000000..34f084cb --- /dev/null +++ b/data/GrabberConfig/dorkly.com.txt @@ -0,0 +1,13 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.dorkly.com%2Fpost%2F77499%2Fgeekiest-office-supplies-ever-made%3Fref%3Dhomepage + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] + +strip_id_or_class: related-links +strip_id_or_class: pagination + +next_page_link: //div[@class = 'pagination']/a[@class = 'next icon-circle-next'] + +test_url: http://www.dorkly.com/post/77499/geekiest-office-supplies-ever-made?ref=homepage +test_url: http://www.dorkly.com/post/73816/a-comic-of-ice-and-fire-a-hero-emerges diff --git a/data/GrabberConfig/dou.ua.txt b/data/GrabberConfig/dou.ua.txt new file mode 100644 index 00000000..0f983112 --- /dev/null +++ b/data/GrabberConfig/dou.ua.txt @@ -0,0 +1,8 @@ +title: //h1[@itemprop="name"] + +author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a + +date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')] + +body: //div[contains(@class, 'b-typo')] +test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm
\ No newline at end of file diff --git a/data/GrabberConfig/douban.com.txt b/data/GrabberConfig/douban.com.txt new file mode 100644 index 00000000..9f27a6fe --- /dev/null +++ b/data/GrabberConfig/douban.com.txt @@ -0,0 +1,21 @@ +# This filter is tested on: +# http://www.douban.com/note/215003067/ +# http://www.douban.com/note/213540049/ +# http://www.douban.com/group/topic/31140104/ + +title: //div[@class='note-header']/h1 +title: //div[@id='content']/h1 + +author: //div[@class='info']/ul/li/a +author: //h3/span/a + +date://div[@class='note-header']/div/span +date://h3/span[contains(@class, 'color-green')] + +body://div[contains(@class, 'note')] +body://div[contains(@class, 'topic-content')] + +strip://h3 + +convert_double_br_tags: yes +test_url: https://www.douban.com/group/topic/31140104/ diff --git a/data/GrabberConfig/doughellmann.com.txt b/data/GrabberConfig/doughellmann.com.txt new file mode 100644 index 00000000..bf9d5fb4 --- /dev/null +++ b/data/GrabberConfig/doughellmann.com.txt @@ -0,0 +1,3 @@ +single_page_link: //a[.="Read more…"]/@href + +test_url: https://doughellmann.com/blog/2017/02/06/getopt-command-line-option-parsing-pymotw-3/ diff --git a/data/GrabberConfig/dpreview.com.txt b/data/GrabberConfig/dpreview.com.txt new file mode 100644 index 00000000..001c810f --- /dev/null +++ b/data/GrabberConfig/dpreview.com.txt @@ -0,0 +1,9 @@ +# next_page_link for product review +# example: http://www.dpreview.com/reviews/lytro/ +next_page_link: //img[@alt = 'Next page']/../@href + +# next_page_link for other articles +# example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 +next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a +single_page_link: //a[contains(.,'Print view')] +test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1
\ No newline at end of file diff --git a/data/GrabberConfig/dr.dk.txt b/data/GrabberConfig/dr.dk.txt new file mode 100644 index 00000000..d8ec1acf --- /dev/null +++ b/data/GrabberConfig/dr.dk.txt @@ -0,0 +1,9 @@ +title: //meta[@property='og:title']/@content +author: //div[@class='articleFunctions']//a +date: //meta[@name='pubdate']/@content + +# Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason) +body: //div[@class='articleContent'] + +tidy: no +test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm
\ No newline at end of file diff --git a/data/GrabberConfig/drdobbs.com.txt b/data/GrabberConfig/drdobbs.com.txt new file mode 100644 index 00000000..b1a9db6f --- /dev/null +++ b/data/GrabberConfig/drdobbs.com.txt @@ -0,0 +1,2 @@ +single_page_link: //a[contains(@href, '/article/print')] +test_url: http://www.drdobbs.com/architecture-and-design/240001128
\ No newline at end of file diff --git a/data/GrabberConfig/drgoulu.com.txt b/data/GrabberConfig/drgoulu.com.txt new file mode 100644 index 00000000..b4fb71bb --- /dev/null +++ b/data/GrabberConfig/drgoulu.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.drgoulu.com%2F2012%2F10%2F07%2Fcomment-stocker-lenergie%2F + +body: //section[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://www.drgoulu.com/2012/10/07/comment-stocker-lenergie/ diff --git a/data/GrabberConfig/drive2.ru.txt b/data/GrabberConfig/drive2.ru.txt new file mode 100644 index 00000000..d500cb81 --- /dev/null +++ b/data/GrabberConfig/drive2.ru.txt @@ -0,0 +1,12 @@ +body: //div[@class = "description"] +body: //div[@id = "post"] + +strip_id_or_class: vcard +strip_id_or_class: journallist +strip_id_or_class: infobox +strip_id_or_class: terms +strip_id_or_class: replieslist +strip_id_or_class: communityside + + +test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/
\ No newline at end of file diff --git a/data/GrabberConfig/dropbox.com.txt b/data/GrabberConfig/dropbox.com.txt new file mode 100644 index 00000000..3b51569f --- /dev/null +++ b/data/GrabberConfig/dropbox.com.txt @@ -0,0 +1,3 @@ +single_page_link: //a[@id='download_button_link'] + +test_url: https://www.dropbox.com/s/qmocfrco2t0d28o/Fluffbeast.docx diff --git a/data/GrabberConfig/drupal.org.txt b/data/GrabberConfig/drupal.org.txt new file mode 100644 index 00000000..2da3eb1c --- /dev/null +++ b/data/GrabberConfig/drupal.org.txt @@ -0,0 +1,8 @@ +title://h1 +author://div[@class="submitted"]/a +date:substring-after(//div[@class="meta"],'modified: ') +date:substring-after(//div[@class="submitted"],'on ') +body://div[@class="node-content"] +strip://div[@class="meta"] +strip_id_or_class:book-navigation +test_url: http://drupal.org/node/1327354
\ No newline at end of file diff --git a/data/GrabberConfig/dushumashang.com.txt b/data/GrabberConfig/dushumashang.com.txt new file mode 100644 index 00000000..6a50a77e --- /dev/null +++ b/data/GrabberConfig/dushumashang.com.txt @@ -0,0 +1,17 @@ +# This filter is tested on: +# http://www.dushumashang.com/2389 +# http://www.dushumashang.com/2415 +# http://www.dushumashang.com/2355 + +body://div[@class='main_content'] +#body://section[@class='entry_content fl'] +title://h2 +author://span[@class='article_author']/a +date://span[@class='pub_date']/time + +strip://span[@class='article_author'] +strip://span[@class='pub_date'] +strip://div[@class='page_turn'] +strip://span[@class='source_link']/em +wrap_in(strong)://span[@class='source_link']/a +test_url: http://www.dushumashang.com/2355
\ No newline at end of file diff --git a/data/GrabberConfig/dzone.com.txt b/data/GrabberConfig/dzone.com.txt new file mode 100644 index 00000000..2c1b40c4 --- /dev/null +++ b/data/GrabberConfig/dzone.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fdzone.com%2Farticles%2Fwhats-wrong-java-8-part-iv + +body: //div[contains(concat(' ',normalize-space(@class),' '),' content-html ')] +test_url: https://dzone.com/articles/whats-wrong-java-8-part-iv
\ No newline at end of file diff --git a/data/GrabberConfig/earther.com.txt b/data/GrabberConfig/earther.com.txt new file mode 100644 index 00000000..5c455563 --- /dev/null +++ b/data/GrabberConfig/earther.com.txt @@ -0,0 +1,16 @@ +body: //section[contains(@class, 'main')] +author: //span[contains(@class, 'display-name')] +date: //span[@class="date"] + +strip_id_or_class: related +strip: //aside +strip: //svg +# For Gumbo parsing <svg>, <math> namespaced elems +strip: //*[local-name() = 'svg'] +strip: //div[contains(@class, 'storytype-label-wrapper')] +strip: //figcaption[contains(@class, 'caption')] + +prune: yes +tidy: no + +test_url: https://earther.com/ireland-is-officially-the-first-country-to-divest-from-1827552460 diff --git a/data/GrabberConfig/eastoftheweb.com.txt b/data/GrabberConfig/eastoftheweb.com.txt new file mode 100644 index 00000000..36708da3 --- /dev/null +++ b/data/GrabberConfig/eastoftheweb.com.txt @@ -0,0 +1,18 @@ +title: //div[@class='title_text'] + +author: //div[@class='author_text'] + +body: //div[@class='story_text']/.. + +strip: //b + +strip_id_or_class: back_to_top +strip_id_or_class: author_text +strip_id_or_class: title_text + +wrap_in(center): //a + +dissolve: //a + +footnotes: no +test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/eatsmarter.de.txt b/data/GrabberConfig/eatsmarter.de.txt new file mode 100644 index 00000000..53e5435f --- /dev/null +++ b/data/GrabberConfig/eatsmarter.de.txt @@ -0,0 +1,36 @@ +# author: kreativmonkey + +# Singlepage +single_page_link: //div[contains(@class, 'pane-eatsmarter-actionbar')]/div[@class='pane-content']/div[contains(@class, 'actionbar-print')]/div[@class='actionbar-item-action']/a + +# Article Information +title: //h1[contains(@class, 'title')] + +# Content +body: //div[contains(@class, 'recipe-printview')] + +# Cleanup +strip_id_or_class: field-name-field-video +strip_id_or_class: field-name-field-image +strip_id_or_class: preparation-step-count #evtl auch nicht! +strip_id_or_class: ad +strip_id_or_class: adsense +strip_id_or_class: print-list +strip_id_or_class: 71M_inreadads +strip_id_or_class: bring-app +strip_id_or_class: simplora-widget +strip_id_or_class: field-name-field-kitchen-utensils +strip_id_or_class: fivestar-wrapper +strip_id_or_class: pane-eatsmarter-latest-comments +strip_id_or_class: traffic-lights-wrapper +strip_id_or_class: logo +strip_id_or_class: traffic-lights-wrapper +strip_id_or_class: info-extra +strip_id_or_class: simplora-widget +strip_id_or_class: shoplink + +test_url: https://eatsmarter.de/rezepte/chili-schokoladen-sorbet +test_contains: Die Schokolade fein hacken + +test_url: https://eatsmarter.de/rezepte/thunfisch-wraps-1/druckansicht +test_contains: den Strunk herausschneiden diff --git a/data/GrabberConfig/ebay.com.txt b/data/GrabberConfig/ebay.com.txt new file mode 100644 index 00000000..f17e1f72 --- /dev/null +++ b/data/GrabberConfig/ebay.com.txt @@ -0,0 +1,5 @@ +body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum'] + +strip_image_src: imgLoading_30x30.gif + +test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204
\ No newline at end of file diff --git a/data/GrabberConfig/ecetia.com.txt b/data/GrabberConfig/ecetia.com.txt new file mode 100644 index 00000000..d67e9103 --- /dev/null +++ b/data/GrabberConfig/ecetia.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://ecetia.com/2011/09/vida-de-jugon-vii-las-tres-es
\ No newline at end of file diff --git a/data/GrabberConfig/echo-online.de.txt b/data/GrabberConfig/echo-online.de.txt new file mode 100644 index 00000000..f72862b7 --- /dev/null +++ b/data/GrabberConfig/echo-online.de.txt @@ -0,0 +1,25 @@ +# Author: Marvin Dickhaus +# 2014-10-08 + +#Tidy just messes up the DOM +tidy: no + +title: //h1 +body: //h2 | //div[@id='artikelteaser'] | //div[@id='artikeltext'] + +#Strip +strip_image_src: artikel_a_merken.gif +strip: //div[@class='zusatzinfo'] + +#Author: substring is used to remove the " Von " prefix. +author: substring(//li[@class='artikelautor'], 5) + +date: //li[@class='artikeldatum'] + +#The first two URLs will at some point no longer show +#the full article. There is a time-based paywall +#installed. Using the feed should present valid output +#test_url: http://www.echo-online.de/art1231,5503063 +#test_url: http://www.echo-online.de/art1168,5502598 +test_url: http://www.echo-online.de/lokales/darmstadt/index.rss + diff --git a/data/GrabberConfig/econlog.econlib.org.txt b/data/GrabberConfig/econlog.econlib.org.txt new file mode 100644 index 00000000..729affd4 --- /dev/null +++ b/data/GrabberConfig/econlog.econlib.org.txt @@ -0,0 +1,6 @@ +title: //h1[@class="title"] +author: //div[@class="hosted"]/a +date: substring-after(//div[@class="dateline"]/text(), '|') + +strip: //a[@class="top" and @href="#"] +test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html
\ No newline at end of file diff --git a/data/GrabberConfig/economie.gouv.fr.txt b/data/GrabberConfig/economie.gouv.fr.txt new file mode 100644 index 00000000..b0db03c1 --- /dev/null +++ b/data/GrabberConfig/economie.gouv.fr.txt @@ -0,0 +1,4 @@ +body: //div[contains(@class, 'txtVisu')] +prune: no + +test_url: http://www.economie.gouv.fr/dgccrf/Publications/Vie-pratique/Fiches-pratiques/Assurance
\ No newline at end of file diff --git a/data/GrabberConfig/economist.com.txt b/data/GrabberConfig/economist.com.txt new file mode 100644 index 00000000..e2198883 --- /dev/null +++ b/data/GrabberConfig/economist.com.txt @@ -0,0 +1,21 @@ +body: //div[@class='main-content'] +body: //article[contains(@class, 'resp-node')] +date: //time[@class='date-created'] +strip: //aside +prune: no + +strip_id_or_class: newsletter-form--inline + +requires_login: yes + +login_uri: https://www.economist.com/user/login +login_username_field: name +login_password_field: pass + +not_logged_in_xpath: //*[@id="user-login-masthead"]/div[@class='login-form'] + +test_url: http://www.economist.com/node/21528429 + +test_url: http://www.economist.com/news/essays/21623373-which-something-old-and-powerful-encountered-vault +test_contains: the calfskin pages are smooth +test_contains: Books will evolve online and off diff --git a/data/GrabberConfig/ecranlarge.com.txt b/data/GrabberConfig/ecranlarge.com.txt new file mode 100644 index 00000000..d962305b --- /dev/null +++ b/data/GrabberConfig/ecranlarge.com.txt @@ -0,0 +1,3 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' content-description ')] + +test_url: https://www.ecranlarge.com/films/news/997080-star-wars-les-derniers-jedi-de-nouvelles-photos-de-rey-et-luke-en-mode-badass diff --git a/data/GrabberConfig/edge-online.com.txt b/data/GrabberConfig/edge-online.com.txt new file mode 100644 index 00000000..47b80e8a --- /dev/null +++ b/data/GrabberConfig/edge-online.com.txt @@ -0,0 +1,13 @@ +title: //meta[@property="og:title"]/@content +body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')] +date: //time[@pubdate]/@datetime +author: //span[@class='author-name'] +prune: no +tidy: no +strip: //footer + +replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak --> + +single_page_link: //a[contains(@href, '?page=show')] + +test_url: http://www.edge-online.com/features/telling-modern-warfares-story
\ No newline at end of file diff --git a/data/GrabberConfig/edition.channel5belize.com.txt b/data/GrabberConfig/edition.channel5belize.com.txt new file mode 100644 index 00000000..6d5f170a --- /dev/null +++ b/data/GrabberConfig/edition.channel5belize.com.txt @@ -0,0 +1,9 @@ +title: //div[@id='singlePage']//h2 +body: //div[@id='singlePage']//div[contains(@class, 'post')] +strip: //a[@title='Email This Story'] +strip_id_or_class: sociable + +prune: no + +test_url: http://edition.channel5belize.com/archives/86016 +test_url: http://edition.channel5belize.com/feed
\ No newline at end of file diff --git a/data/GrabberConfig/edition.cnn.com.txt b/data/GrabberConfig/edition.cnn.com.txt new file mode 100644 index 00000000..e6639e95 --- /dev/null +++ b/data/GrabberConfig/edition.cnn.com.txt @@ -0,0 +1,13 @@ +body: //section[contains(@class, 'body-text')] + +strip_id_or_class: highlights + +# Avoid redirecting to 'unsupported browser' page +find_string: <meta http-equiv="refresh" +replace_string: <meta norefresh + +test_url: http://edition.cnn.com/2012/05/13/us/new-york-police-policy/index.html +test_contains: this discriminatory and ineffective practice + +test_url: http://rss.cnn.com/rss/edition.rss +test_url: http://rss.cnn.com/rss/edition_technology.rss diff --git a/data/GrabberConfig/eetimes.com.txt b/data/GrabberConfig/eetimes.com.txt new file mode 100644 index 00000000..a99777e6 --- /dev/null +++ b/data/GrabberConfig/eetimes.com.txt @@ -0,0 +1,8 @@ +body: //div[contains(@class, 'grayshowlinks')] + +next_page_link: //div[@id='sitecontentcol']//a[.='Next >'] +# Doesn't work (site doesn't always load full content in print view) +#single_page_link: //div[@id='sitecontentcol']//a[contains(@href, 'print=yes')] + +test_url: https://www.eetimes.com/document.asp?doc_id=1319966 +test_url: https://www.eetimes.com/rss_simple.asp diff --git a/data/GrabberConfig/eff.org.txt b/data/GrabberConfig/eff.org.txt new file mode 100644 index 00000000..bdcb6f6e --- /dev/null +++ b/data/GrabberConfig/eff.org.txt @@ -0,0 +1,3 @@ +author: //meta[@name="author"]/@content + +test_url: https://www.eff.org/deeplinks/2018/02/john-perry-barlow-internet-pioneer-1947-2018 diff --git a/data/GrabberConfig/ekultura.hu.txt b/data/GrabberConfig/ekultura.hu.txt new file mode 100644 index 00000000..3756027c --- /dev/null +++ b/data/GrabberConfig/ekultura.hu.txt @@ -0,0 +1,11 @@ +title: //h1[@class='style6 nevek'] + +body: //div[@class='bal3'] + + +prune: yes + +tidy: yes +convert_double_br_tags: yes + +test_url: 
http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december
\ No newline at end of file diff --git a/data/GrabberConfig/elance.com.txt b/data/GrabberConfig/elance.com.txt new file mode 100644 index 00000000..d4b0a9b8 --- /dev/null +++ b/data/GrabberConfig/elance.com.txt @@ -0,0 +1,3 @@ +body: //div[@id='jobDesc-bd']/p + +test_url: http://www.elance.com/j/xml-technical-intergration/23687172/
\ No newline at end of file diff --git a/data/GrabberConfig/elblogsalmon.com.txt b/data/GrabberConfig/elblogsalmon.com.txt new file mode 100644 index 00000000..f692047d --- /dev/null +++ b/data/GrabberConfig/elblogsalmon.com.txt @@ -0,0 +1,3 @@ +replace_string(sf-src): src + +test_url: https://www.elblogsalmon.com/economia/no-todo-fue-mal-con-el-euro-datos-que-indican-que-fue-una-buena-idea diff --git a/data/GrabberConfig/elconfidencial.com.txt b/data/GrabberConfig/elconfidencial.com.txt new file mode 100644 index 00000000..7d95d9d6 --- /dev/null +++ b/data/GrabberConfig/elconfidencial.com.txt @@ -0,0 +1,9 @@ +title: //div[@class='news-header-tit-box']//h1 +body: //div[@id='news-body-center'] +author: //a[@class='news-def-author'] +date: //time + +# first image inside the article +strip: //div[@id='news-body-center']//article + +test_url: http://www.elconfidencial.com/tecnologia/2017-02-06/microsoft-windows-google-chrome_1327589/ diff --git a/data/GrabberConfig/elderscrollsonline.com.txt b/data/GrabberConfig/elderscrollsonline.com.txt new file mode 100644 index 00000000..fa3892c6 --- /dev/null +++ b/data/GrabberConfig/elderscrollsonline.com.txt @@ -0,0 +1,22 @@ +date: //time +title: //h1[contains(@class, "alpha")] +body: //article[contains(@class, "news-post")] + +# fix dates - dates as they are won't work as strtotime doesn't understand format (03.28.2013) +replace_string(<time class="gamma">01.): <time class="gamma">January. +replace_string(<time class="gamma">02.): <time class="gamma">February. +replace_string(<time class="gamma">03.): <time class="gamma">March. +replace_string(<time class="gamma">04.): <time class="gamma">April. +replace_string(<time class="gamma">05.): <time class="gamma">May. +replace_string(<time class="gamma">06.): <time class="gamma">June. +replace_string(<time class="gamma">07.): <time class="gamma">July. +replace_string(<time class="gamma">08.): <time class="gamma">August. 
+replace_string(<time class="gamma">09.): <time class="gamma">September. +replace_string(<time class="gamma">10.): <time class="gamma">October. +replace_string(<time class="gamma">11.): <time class="gamma">November. +replace_string(<time class="gamma">12.): <time class="gamma">December. + +prune: no + +test_url: http://elderscrollsonline.com/en/rss +test_url: http://elderscrollsonline.com/en/news/post/2013/03/27/developer-question-of-the-week-17
\ No newline at end of file diff --git a/data/GrabberConfig/eleconomista.es.txt b/data/GrabberConfig/eleconomista.es.txt new file mode 100644 index 00000000..18a57d53 --- /dev/null +++ b/data/GrabberConfig/eleconomista.es.txt @@ -0,0 +1,4 @@ +title: //h1 +body: //div[@id='cuerpo_noticia']/p + +test_url: http://www.eleconomista.es/construccion-inmobiliario/noticias/9052821/04/18/Asi-es-el-puente-maritimo-mas-largo-del-mundo-luces-y-sombras-del-viaducto-que-unira-Macao-Hong-Kong-y-Zhuhai.html diff --git a/data/GrabberConfig/elektroniknet.de.txt b/data/GrabberConfig/elektroniknet.de.txt new file mode 100644 index 00000000..1c65dff3 --- /dev/null +++ b/data/GrabberConfig/elektroniknet.de.txt @@ -0,0 +1,11 @@ +date: //time +next_page_link: //li[@class='next']//a +strip: //aside + +# 1 page +test_url: http://www.elektroniknet.de/elektronik-automotive/wirtschaft/aus-quattro-gmbh-wird-audi-sport-136464.html +test_contains: etwa 1.200 Mitarbeiter an den Standorten Neckarsulm und Ingolstadt + +# 2 pages +test_url: http://www.elektroniknet.de/elektronik-automotive/sonstiges/machine-learning-wird-demnaechst-massiv-an-bedeutung-gewinnen-136362.html +test_contains: Ende 2014 übernahm er seine heutige Position als CEO von MBRDNA diff --git a/data/GrabberConfig/elmalpensante.com.txt b/data/GrabberConfig/elmalpensante.com.txt new file mode 100644 index 00000000..435c6c20 --- /dev/null +++ b/data/GrabberConfig/elmalpensante.com.txt @@ -0,0 +1,4 @@ +single_page_link: //a[contains(@href, 'print_contenido')] +title: //h2 +author: //div[@class="autor"] +test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668
\ No newline at end of file diff --git a/data/GrabberConfig/elmundo.es.txt b/data/GrabberConfig/elmundo.es.txt new file mode 100644 index 00000000..53533e02 --- /dev/null +++ b/data/GrabberConfig/elmundo.es.txt @@ -0,0 +1,8 @@ +title: //h1[@class='js-headline'] +body: //div[@itemprop='articleBody'] +author: //li[@class='author-name'] + +strip: (//figure)[1] +strip: //aside + +test_url: http://www.elmundo.es/cataluna/2018/01/14/5a5b3be9e5fdea3f118b45c6.html diff --git a/data/GrabberConfig/elpais.com.txt b/data/GrabberConfig/elpais.com.txt new file mode 100644 index 00000000..de8e8a4b --- /dev/null +++ b/data/GrabberConfig/elpais.com.txt @@ -0,0 +1,23 @@ +title: //meta[@name='DC.title']/@content +title: //div[contains(@class, 'cabecera_noticia')]//h1 +date: //meta[@name='DC.date']/@content +date: //meta[@name='date']/@content +body: //div[@class='columna_texto'] +body: //div[@id='cuerpo_noticia'] +body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] + +prune: no + +strip_id_or_class: disposicion_vertical +strip_id_or_class: ampliar_foto +strip_id_or_class: utilidades +strip_id_or_class: info_relacionada +strip_id_or_class: m-kiosko +strip_id_or_class: info_complementa + +strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] +strip: //div[@id='coment' or @id='foros_not'] +strip: //picture//source + +test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html +test_url: http://elpais.com/diario/2012/02/07/cultura/1328569202_850215.html diff --git a/data/GrabberConfig/emaratalyoum.com.txt b/data/GrabberConfig/emaratalyoum.com.txt new file mode 100644 index 00000000..749491b7 --- /dev/null +++ b/data/GrabberConfig/emaratalyoum.com.txt @@ -0,0 +1,8 @@ +body: //div[@id='article-body' or contains(@class, 'articleinlinegallery')] + +prune: no + +test_url: http://www.emaratalyoum.com/politics/news/2015-10-10-1.828836 +test_url: http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601844 +test_url: 
http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601842 +test_url: http://www.emaratalyoum.com/public-sports-1.533088?ot=ot.AjaxPageLayout
\ No newline at end of file diff --git a/data/GrabberConfig/en.espnf1.com.txt b/data/GrabberConfig/en.espnf1.com.txt new file mode 100644 index 00000000..2ca0216b --- /dev/null +++ b/data/GrabberConfig/en.espnf1.com.txt @@ -0,0 +1,10 @@ +body: //div[@id='content'] +strip: //div[@class='rl'] +strip: //p[@class='authdesc'] +strip: //p[@class='strybtm'] +strip: //div[@id='stryFtrLft'] +strip: //div[@id='f1Conversation'] +strip: //div[@id='cmtSpncrRuler'] +strip: //div[@id='stryComments'] +strip: //div[@id='athrData'] +test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html
\ No newline at end of file diff --git a/data/GrabberConfig/engadget.com.txt b/data/GrabberConfig/engadget.com.txt new file mode 100644 index 00000000..f5f7cf51 --- /dev/null +++ b/data/GrabberConfig/engadget.com.txt @@ -0,0 +1,6 @@ +title: //meta[@property="og:title"]/@content +body: //div[contains(@class, 'o-article_block')] + +prune: no + +test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/
\ No newline at end of file diff --git a/data/GrabberConfig/engineering.tumblr.com.txt b/data/GrabberConfig/engineering.tumblr.com.txt new file mode 100644 index 00000000..48f301fe --- /dev/null +++ b/data/GrabberConfig/engineering.tumblr.com.txt @@ -0,0 +1,7 @@ +title: //h2 +body: //div[@class="post_content"] +author: //p[@class="author"]/a +date: //p[@class="date"] +strip: //h2 +strip: //header +test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose
\ No newline at end of file diff --git a/data/GrabberConfig/english.aljazeera.net.txt b/data/GrabberConfig/english.aljazeera.net.txt new file mode 100644 index 00000000..97365994 --- /dev/null +++ b/data/GrabberConfig/english.aljazeera.net.txt @@ -0,0 +1,7 @@ +title: //span[@id='DetailedTitle'] +body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary'] +strip_id_or_class: sidebar +strip_id_or_class: Skyscrapper_Body +strip: //td[@class='DetailedSummary']/table[position() != 1] +prune: no +test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html
\ No newline at end of file diff --git a/data/GrabberConfig/enikos.gr.txt b/data/GrabberConfig/enikos.gr.txt new file mode 100644 index 00000000..ddd51c4b --- /dev/null +++ b/data/GrabberConfig/enikos.gr.txt @@ -0,0 +1,9 @@ +body: //div[@id='article']//div[contains(@class, 'inside')] + +strip_id_or_class: tags +strip_id_or_class: actions +strip_id_or_class: google-ads + +prune: no + +test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html
\ No newline at end of file diff --git a/data/GrabberConfig/entertainment.timesonline.co.uk.txt b/data/GrabberConfig/entertainment.timesonline.co.uk.txt new file mode 100644 index 00000000..a756c457 --- /dev/null +++ b/data/GrabberConfig/entertainment.timesonline.co.uk.txt @@ -0,0 +1,10 @@ +author://div[@class = 'article-author']/span[@class = 'byline'] +title://h1[@class = 'heading'] +body://div[@id = 'related-article-links'] +strip://div[@id = 'comment-sort-order'] +strip://div[@id = 'my-profile'] +strip://div[@class = 'article-author'] +strip://div[@class = 'bg-f8f1d8 width-385 text-left'] +strip://div[@id = 'login-status'] +strip://div[@class = 'puff-padding'] +test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece
\ No newline at end of file diff --git a/data/GrabberConfig/entwickler.de.txt b/data/GrabberConfig/entwickler.de.txt new file mode 100644 index 00000000..316f3991 --- /dev/null +++ b/data/GrabberConfig/entwickler.de.txt @@ -0,0 +1,7 @@ +title: //h1[@class="post-title"] +body: //section[@class="article-content"] +author: //div[@class="post-bottom-meta"]/span[@class="post-author"] +date: //div[@class="post-date"]/time/@datetime + +test_url: https://entwickler.de/online/mobile-welt-offline-welt-was-der-offline-first-ansatz-fuer-app-entwickler-heisst-140602.html +test_url: https://entwickler.de/online/development/plex-docker-joomla-165345.html diff --git a/data/GrabberConfig/enviscope.com.txt b/data/GrabberConfig/enviscope.com.txt new file mode 100644 index 00000000..990920ae --- /dev/null +++ b/data/GrabberConfig/enviscope.com.txt @@ -0,0 +1,21 @@ + + +body: //div[@id='content-texte'] +title: //html/head/title + +test_url: https://www.enviscope.com/environnement/electricite-des-industriels-flexi-consommateurs-se-regroupent/63151 + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' btn-abo-container ')] +login_uri: https://www.enviscope.com/login +login_username_field: log +login_password_field: pwd +login_extra_fields: action=login +login_extra_fields: instance= +login_extra_fields: redirect_to=https://www.enviscope.com/wp-admin/ +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Se+connecter + diff --git a/data/GrabberConfig/ericsuh.com.txt b/data/GrabberConfig/ericsuh.com.txt new file mode 100644 index 00000000..d25140c5 --- /dev/null +++ b/data/GrabberConfig/ericsuh.com.txt @@ -0,0 +1,4 @@ +date: //h6[@class='datetime']/child::text() +author: string("Eric J. 
Suh") +footnotes: yes +test_url: http://www.ericsuh.com/blog/posts/2012/8/strange-numbers.html
\ No newline at end of file diff --git a/data/GrabberConfig/ernestmag.fr.txt b/data/GrabberConfig/ernestmag.fr.txt new file mode 100644 index 00000000..742f485d --- /dev/null +++ b/data/GrabberConfig/ernestmag.fr.txt @@ -0,0 +1,31 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-video ')] | //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' post-author ')]//a[@rel='author'] + +prune: no + +strip: //script +strip_id_or_class: post-meta +strip_id_or_class: post-title +strip_id_or_class: post-category +strip_id_or_class: woocommerce +strip_id_or_class: article-cta + +test_url: https://www.ernestmag.fr/2018/06/18/coulon-en-chair-en-os-et-en-voix/ +test_url: https://www.ernestmag.fr/2018/07/31/decouvertes-ernest-de-lannee/ +test_url: https://www.ernestmag.fr/2018/04/03/sous-le-manteau-du-volcan/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +login_uri: https://www.ernestmag.fr/mon-compte/ +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' wc-memberships-content-restricted-message ')] +login_username_field: username +login_password_field: password +login_extra_fields: woocommerce-login-nonce=@=xpath('//input[@name="woocommerce-login-nonce"]', request_html('https://www.ernestmag.fr/mon-compte/')) +login_extra_fields: _wp_http_referer=/mon-compte/ +login_extra_fields: login=Se+connecter +login_extra_fields: rememberme=forever +# Remark: login to ernestmag.fr could fail if more than 2 active sessions are detected diff --git a/data/GrabberConfig/escapistmagazine.com.txt b/data/GrabberConfig/escapistmagazine.com.txt new file mode 100644 index 00000000..fd453a19 --- /dev/null +++ b/data/GrabberConfig/escapistmagazine.com.txt @@ -0,0 +1,8 @@ +title: 
//h1[@class='headline']/div[@class='name'] + +strip_image_src: 'http://cdn.themis-media.com/media/global/images/library/deriv/115/115825.png' + +next_page_link: //a[@class='next_page'] + +strip_comments: no +test_url: http://www.escapistmagazine.com/articles/view/columns/criticalintel/10302-I-Hate-Magic
\ No newline at end of file diff --git a/data/GrabberConfig/esglobal.org.txt b/data/GrabberConfig/esglobal.org.txt new file mode 100644 index 00000000..6408f441 --- /dev/null +++ b/data/GrabberConfig/esglobal.org.txt @@ -0,0 +1,3 @@ +body: //div[@class='blog-content'] + +test_url: http://www.esglobal.org/el-caos-en-el-este-los-socios-de-la-ue-necesitan-que-se-les-preste-atencion/ diff --git a/data/GrabberConfig/espacepolitique.revues.org.txt b/data/GrabberConfig/espacepolitique.revues.org.txt new file mode 100644 index 00000000..c4acb26e --- /dev/null +++ b/data/GrabberConfig/espacepolitique.revues.org.txt @@ -0,0 +1,10 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fespacepolitique.revues.org%2F284 + +body: //div[@id='text'] + +strip_id_or_class: paranumber +strip_id_or_class: go-top + +test_url: http://espacepolitique.revues.org/284 diff --git a/data/GrabberConfig/espn.go.com.txt b/data/GrabberConfig/espn.go.com.txt new file mode 100644 index 00000000..06476296 --- /dev/null +++ b/data/GrabberConfig/espn.go.com.txt @@ -0,0 +1,12 @@ +title: //div[@class='headline'] | //div[@class='mod-header']/h3 +body: //div[contains(@class, 'article')] +strip: //div[contains(@class, 'mod-inline')] +strip: //*/span[@class='page-actions'] +strip: //div[@class='page-actions']/* +strip: //div[@class='headline'] | //div[@class='mod-header']/h3 +strip: //div[@class='mod-blog-navigation'] +strip: //div[@class='monthday'] +strip: //div[@class='time'] +strip: //div[@class='timeofday'] +strip: //div[contains(@class, 'mod-conversations')] +test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign
\ No newline at end of file diff --git a/data/GrabberConfig/esquire.com.txt b/data/GrabberConfig/esquire.com.txt new file mode 100644 index 00000000..b9cb1e55 --- /dev/null +++ b/data/GrabberConfig/esquire.com.txt @@ -0,0 +1,11 @@ +title: //h1 +author: //div[@id='byline'] + +body: //div[@id='printBody'] + +single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/')) + +prune: no + +test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810 +test_url: http://www.esquire.com/blogs/politics/police-getting-leftover-armoured-iraq-trucks-112513
\ No newline at end of file diff --git a/data/GrabberConfig/essonneinfo.fr.txt b/data/GrabberConfig/essonneinfo.fr.txt new file mode 100644 index 00000000..942e6bb3 --- /dev/null +++ b/data/GrabberConfig/essonneinfo.fr.txt @@ -0,0 +1,26 @@ + +body: //article[contains(concat(' ',normalize-space(@class),' '),' Post ')] + +author: //*[@class='Post-header']//a[@rel='author'] + +strip_id_or_class: Post-header +strip_id_or_class: Post-share +strip_id_or_class: Post-private +strip_id_or_class: Post-actions +strip_id_or_class: Bloc-auteur +strip_id_or_class: Post-then + +test_url: https://www.essonneinfo.fr/91-essonne-info/122987/download-festival-retour-evenement-explosif/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +login_uri: https://www.essonneinfo.fr/wp-login.php +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' Post-private ')] +login_username_field: log +login_password_field: pwd +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Se+connecter +test_url: https://www.essonneinfo.fr/91-essonne-info/123619/metro-18-a-loupe-environnementale/ + diff --git a/data/GrabberConfig/estadao.com.br.txt b/data/GrabberConfig/estadao.com.br.txt new file mode 100644 index 00000000..5ed8f7e4 --- /dev/null +++ b/data/GrabberConfig/estadao.com.br.txt @@ -0,0 +1,4 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' titulo ')] +body: //article[contains(concat(' ',normalize-space(@class),' '),' texto ')] + +test_url: http://ciencia.estadao.com.br/noticias/geral,22-mil-toneladas-de-fosforo-do-saara-fertilizam-a-amazonia,1640532 diff --git a/data/GrabberConfig/eternabuenosaires.com.txt b/data/GrabberConfig/eternabuenosaires.com.txt new file mode 100644 index 00000000..bfa2c5dc --- /dev/null +++ b/data/GrabberConfig/eternabuenosaires.com.txt @@ -0,0 +1,2 @@ +body: 
//*[(@class = "historia")] +test_url: http://eternabuenosaires.com/2011/09/calle-adolfo-bioy-casares
\ No newline at end of file diff --git a/data/GrabberConfig/eurogamer.net.txt b/data/GrabberConfig/eurogamer.net.txt new file mode 100644 index 00000000..8931becb --- /dev/null +++ b/data/GrabberConfig/eurogamer.net.txt @@ -0,0 +1,9 @@ +body: //p[@class='strapline'] | //div[@class='cover-image'] | //article[@class='hd'] +strip: //div[@class='social top'] +strip: //p[@class='byline'] + +date: //span[@itemprop='datePublished'] +author: //a[@itemprop='author']/text() + +test_url: http://www.eurogamer.net/articles/2014-08-20-bungie-ordered-to-return-shares-to-composer-marty-odonnell +test_url: http://www.eurogamer.net/articles/2014-08-20-invisible-inc-does-espionage-justice diff --git a/data/GrabberConfig/everydayfeminism.com.txt b/data/GrabberConfig/everydayfeminism.com.txt new file mode 100644 index 00000000..d7d911f5 --- /dev/null +++ b/data/GrabberConfig/everydayfeminism.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Feverydayfeminism.com%2F2017%2F05%2Fsupport-autistic-community%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')] +test_url: http://everydayfeminism.com/2017/05/support-autistic-community/ diff --git a/data/GrabberConfig/evo.co.uk.txt b/data/GrabberConfig/evo.co.uk.txt new file mode 100644 index 00000000..ccb4f879 --- /dev/null +++ b/data/GrabberConfig/evo.co.uk.txt @@ -0,0 +1,11 @@ +author: substring-after(//div[@class='articleauthor'],'By ') + +# Blog posts +date: //div[@class='articledate'] +# News +date: //div[@class='articledate_b'] + +body: //div[@class='articletext'] + +convert_double_br_tags: yes +test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html
\ No newline at end of file diff --git a/data/GrabberConfig/eweek.com.txt b/data/GrabberConfig/eweek.com.txt new file mode 100644 index 00000000..115ff49f --- /dev/null +++ b/data/GrabberConfig/eweek.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.eweek.com%2Fsecurity%2Fresearchers-from-google-cti-break-sha-1-hash-encryption-function.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article_body ')] +test_url: http://www.eweek.com/security/researchers-from-google-cti-break-sha-1-hash-encryption-function.html diff --git a/data/GrabberConfig/explosm.net.txt b/data/GrabberConfig/explosm.net.txt new file mode 100644 index 00000000..f2d0a20f --- /dev/null +++ b/data/GrabberConfig/explosm.net.txt @@ -0,0 +1,4 @@ +body: //img[@id='main-comic'] +author: substring(//small[@class="author-credit-name"], 4) + +test_url: http://explosm.net/comics/3954/ diff --git a/data/GrabberConfig/expressen.se.txt b/data/GrabberConfig/expressen.se.txt new file mode 100644 index 00000000..d81d3251 --- /dev/null +++ b/data/GrabberConfig/expressen.se.txt @@ -0,0 +1,10 @@ +title: //h1[contains(@class, 'b-headline_article')] +body: //div[contains(@class, 'b-article_print')] + +single_page_link: //div[contains(@class, 'b-page__footer__actions')]//a[contains(@href, 'print=true')] + +prune: no + +test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at +test_url: http://www.expressen.se/gt/polis-om-styckmordet-extremt-markligt-fall/ +test_url: http://www.expressen.se/Pages/OutboundFeedsPage.aspx?id=3642159&viewstyle=rss
\ No newline at end of file diff --git a/data/GrabberConfig/expresso.sapo.pt.txt b/data/GrabberConfig/expresso.sapo.pt.txt new file mode 100644 index 00000000..29961e16 --- /dev/null +++ b/data/GrabberConfig/expresso.sapo.pt.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fexpresso.sapo.pt%2Feconomia%2F2017-08-07-Lisboa-e-a-nova-Berlim-diz-a-CNN + +body: //main[contains(concat(' ',normalize-space(@class),' '),' mainContent ')] +test_url: http://expresso.sapo.pt/economia/2017-08-07-Lisboa-e-a-nova-Berlim-diz-a-CNN
\ No newline at end of file diff --git a/data/GrabberConfig/extracine.com.txt b/data/GrabberConfig/extracine.com.txt new file mode 100644 index 00000000..52b598da --- /dev/null +++ b/data/GrabberConfig/extracine.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://extracine.com/2011/09/straw-dogs-la-original
\ No newline at end of file diff --git a/data/GrabberConfig/facebook.com.txt b/data/GrabberConfig/facebook.com.txt new file mode 100644 index 00000000..3aefc615 --- /dev/null +++ b/data/GrabberConfig/facebook.com.txt @@ -0,0 +1,20 @@ +body: //div[@id='imagestage'] +body: //div[contains(@class, 'userContentWrapper')] +body: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')] +strip_id_or_class: commentable +strip: //div[contains(@data-sigil, 'm-mentions-expand')] + +http_header(user-agent): PHP/5.3 +strip_id_or_class: copyright +# this removes the cookie policy banner +strip_id_or_class: fbPageBannerInner + +prune: no +tidy: no + +#single_page_link: concat("https://m.", substring-after(//link[@rel="alternate" and @media="handheld"]/@href, "//www.")) +#if_page_contains: //link[@rel="alternate" and @media="handheld"] + +test_url: https://www.facebook.com/permalink.php?story_fbid=10154584776550183&id=294468630182 +test_contains: holding an extraordinary session in Brussels this month +test_url: https://www.facebook.com/notes/protect-the-graph/retiring-sha-1-certificates/1814716098768533 diff --git a/data/GrabberConfig/facta.co.jp.txt b/data/GrabberConfig/facta.co.jp.txt new file mode 100644 index 00000000..f130568a --- /dev/null +++ b/data/GrabberConfig/facta.co.jp.txt @@ -0,0 +1,3 @@ +body: //div[@class='content'] + +test_url: http://facta.co.jp/blog/archives/20111026001026.html diff --git a/data/GrabberConfig/factuel.info.txt b/data/GrabberConfig/factuel.info.txt new file mode 100644 index 00000000..482c30ad --- /dev/null +++ b/data/GrabberConfig/factuel.info.txt @@ -0,0 +1,30 @@ + +author: //article//p[@class='group-inline']/a + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')] + +strip: //p[@class="group-inline"] +strip_id_or_class: field-name-field-type-article +strip_id_or_class: social-share +strip: //div[contains(concat(' ',normalize-space(@class),' '),' nopremium-message ')]/following-sibling::* 
+strip_id_or_class: nopremium-message +strip_id_or_class: field-name-nopremium-login +strip_id_or_class: field-name-nopremium-register + +test_url: http://www.factuel.info/article/cite-brulard-a-besancon-place-deconstructeurs-004924 + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +# Remark: login won't work as long as there is an HTTPS certificate issue on this website +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' field-name-nopremium-login ')] +login_uri: https://www.factuel.info/user +login_username_field: name +login_password_field: pass +login_extra_fields: op=Se connecter +login_extra_fields: form_id=user_login +login_extra_fields: form_build_id=@=xpath('//input[@name="form_build_id"]', request_html('https://www.factuel.info/user')) + +test_url: https://www.factuel.info/article/resistives-festival-eco-citoyen-ancre-dans-son-territoire-004927 + diff --git a/data/GrabberConfig/fakirpresse.info.txt b/data/GrabberConfig/fakirpresse.info.txt new file mode 100644 index 00000000..c58760c5 --- /dev/null +++ b/data/GrabberConfig/fakirpresse.info.txt @@ -0,0 +1,3 @@ +author: //a[@class="url fn spip_in"] + +test_url: https://www.fakirpresse.info/les-patrons-ca-osent-tout diff --git a/data/GrabberConfig/falter.at.txt b/data/GrabberConfig/falter.at.txt new file mode 100644 index 00000000..2bfcc9b4 --- /dev/null +++ b/data/GrabberConfig/falter.at.txt @@ -0,0 +1,14 @@ +title: //h1 +author: //a[contains(@href, '/kategorie/autoren')] +date: //a[contains(@href, '/falter/ausgabe')] +body: //article[@class='spanMain'] + +# cleanup +strip_id_or_class: 'respond' +strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif'] +strip_id_or_class: 'meta' +strip_id_or_class: 'servicebox' +strip_id_or_class: 'related' +strip_id_or_class: 'twitter-share-button' +strip: //br +test_url: 
http://www.falter.at/falter/2013/03/26/der-dandy-auf-der-sinkenden-galeere/
\ No newline at end of file diff --git a/data/GrabberConfig/fanfiction.net.txt b/data/GrabberConfig/fanfiction.net.txt new file mode 100644 index 00000000..e7cab4d4 --- /dev/null +++ b/data/GrabberConfig/fanfiction.net.txt @@ -0,0 +1,6 @@ +body: //*[@id = 'story text'] +author: //a[starts-with(@href, '/u/')] +next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") +autodetect_next_page:yes +strip_id_or_class: 'a2a_kit' +test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love
\ No newline at end of file diff --git a/data/GrabberConfig/fastcompany.com.txt b/data/GrabberConfig/fastcompany.com.txt new file mode 100644 index 00000000..bf8375ee --- /dev/null +++ b/data/GrabberConfig/fastcompany.com.txt @@ -0,0 +1,20 @@ +author: //div[@class='byline']//a +date: //meta[@property='article:published_time']/@content +body: //figure[@class='jumbotron'] | //div[@itemprop='body'] + +prune: no + +#strip_id_or_class: article-top-wrapper +#strip_id_or_class: footer-message +#strip_id_or_class: print-logo +#strip: //cite +#strip://*[@class='timestamp'] +#strip://div[@id='page_right'] +#strip://section[@id='header_region'] +#strip://h1[@class='node-title'] +#strip://div[@class='node-submitted'] +#strip_id_or_class: skipnav + +test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity +test_contains: Some of you may have tried to reach me this morning +test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day diff --git a/data/GrabberConfig/faz.net.txt b/data/GrabberConfig/faz.net.txt new file mode 100644 index 00000000..ada0b0fc --- /dev/null +++ b/data/GrabberConfig/faz.net.txt @@ -0,0 +1,103 @@ +# Author: zinnober +# Complete rewrite of the faz.net template as the standard one is broken +# I tried to consider as many page variants as possible, which was some serious work + +tidy: no +prune: no + +# Title +title: //p[@class='Content HeadlineShort'] + +# Set author +author: substring-after(//span[@class='Autor'], 'von ') +author: //span[@class='caps last']/span[@class='caps last'] +author: //a[@rel='author'] + +# Set date +date: //span[@class='Datum'] +date: //span[@class='Datum'],/span + +# Fetch full multipage articles +next_page_link: //a[@title='Nächste Seite'] + +# Content is here +body: //div[@class='Artikel'] + +# Tidy up before article +strip: //div[@id='FAZHeaderNeu'] +strip: //h2[@itemprop='headline'] +strip: //span[@class='Datum'] +strip: //span[@class='Autor'] +strip_id_or_class: 
ArticlePagerTop + +# General cleanup +strip: //div[@class='clear'] +strip: //a[@title='Zur Homepage FAZ.NET'] +strip: //iframe +replace_string( · ): +strip_id_or_class: TeaserMore +strip_id_or_class: plista_alternativ + +# Remove tracking and ads +strip_image_src: /l.gif? +strip: //div[contains(@style, 'background-image')] +strip: //img[@width='1'] +strip_id_or_class: invisible +strip_id_or_class: Anzeige +strip_id_or_class: billboard + +# Remove various text boxes and social media foo +strip_id_or_class: WeitereBeitraege +strip_id_or_class: WBListe +strip_id_or_class: AutorenModul +strip_id_or_class: Community +strip_id_or_class: SocialMediaStatus +strip_id_or_class: RelatedLinkBox +strip_id_or_class: MultimediaNavigation +strip_id_or_class: IndexTitel + +# Fix picture caps and pictures (use better resolution and remove clutter) +strip_id_or_class: LightBoxOverlay +strip_id_or_class: exitLarge +strip_id_or_class: PagerBox +strip_id_or_class: Bildnachweis +strip_id_or_class: Bildueberschrift +strip_id_or_class: Bildbeschreibung +strip_id_or_class: ArtikelBild610 +strip_id_or_class: MediaLink +strip_id_or_class: FotoBoxInnerLeft +strip_id_or_class: BilderRelatedLinks + +# Remove clutter after article +strip_id_or_class: ArticlePagerBottom +strip_id_or_class: backToHome +strip_id_or_class: ArtikelAbbinder +strip_id_or_class: lesermeinungscontainer +strip_id_or_class: ThemenLinks +strip_id_or_class: rechtehinweis +strip_id_or_class: FAZArtikelMap +strip_id_or_class: FAZArtikelKommentare +strip_id_or_class: ArtikelKommentieren +strip_id_or_class: FAZArtikelFunktionen +strip_id_or_class: mailLB +strip_id_or_class: FAZContentRight +strip_id_or_class: stageModule +strip_id_or_class: ContentFooter +strip_id_or_class: ServicesFooter +strip_id_or_class: FAZFooter + +# Clean up stuff present just in some articles +strip_id_or_class: Teaser620 +strip_id_or_class: TeaserMultimedia +strip_id_or_class: VideoBox + +# Remove as soon as Wallabag might be able to embed flash video 
+strip_id_or_class: mmoObjectAsTeaserInArticle +strip_id_or_class: additionalStylesAudioVideo +strip_id_or_class: hideMMElements + +# Try it yourself +test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken +test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html +test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html + diff --git a/data/GrabberConfig/feeds.feedblitz.com.txt b/data/GrabberConfig/feeds.feedblitz.com.txt new file mode 100644 index 00000000..1c08d8f7 --- /dev/null +++ b/data/GrabberConfig/feeds.feedblitz.com.txt @@ -0,0 +1 @@ +http_header(referer): http://feedblitz.com diff --git a/data/GrabberConfig/fertigung.de.txt b/data/GrabberConfig/fertigung.de.txt new file mode 100644 index 00000000..90145e58 --- /dev/null +++ b/data/GrabberConfig/fertigung.de.txt @@ -0,0 +1,23 @@ +title: //title + +body: //div[@id='content'] + +strip: (//div[@id='content']/h2)[1] + +strip: //h2[contains(., 'mehr News')]/following::* +strip: //h2[contains(., 'mehr News')] + +strip: //div[contains(@class, 'indizar')]/following::* +strip: //div[contains(@class, 'indizar')] + +strip: //h1[contains(@class, 'single')]/preceding::* +strip: //h1[contains(@class, 'single')] + +strip_id_or_class: plista_widget + +prune: no + +next_page_link: //a[contains(., 'Weiter')] + +test_url: http://www.fertigung.de/2013/04/igus-neuer-energiekettenkatalog/ +test_url: http://www.fertigung.de/2013/04/dynamisch-und-hochpraezise/
\ No newline at end of file diff --git a/data/GrabberConfig/fictionpress.com.txt b/data/GrabberConfig/fictionpress.com.txt new file mode 100644 index 00000000..19ec16b0 --- /dev/null +++ b/data/GrabberConfig/fictionpress.com.txt @@ -0,0 +1,5 @@ +body: id('storytext') +author: //a[starts-with(@href, '/u/')] +#next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") +strip_id_or_class: 'a2a_kit' +test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew
\ No newline at end of file diff --git a/data/GrabberConfig/ficwad.com.txt b/data/GrabberConfig/ficwad.com.txt new file mode 100644 index 00000000..081f0bb0 --- /dev/null +++ b/data/GrabberConfig/ficwad.com.txt @@ -0,0 +1,12 @@ +title: //h4 +author: //span[@class="author"] +body: //div[@id="story"] +strip_id_or_class: summary +strip_id_or_class: meta +strip_id_or_class: storyfoot +convert_double_br_tags: yes +prune: no + +# Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface. + +test_url: http://www.ficwad.com/story/158977
\ No newline at end of file diff --git a/data/GrabberConfig/filmstarts.de.txt b/data/GrabberConfig/filmstarts.de.txt new file mode 100644 index 00000000..e0274d49 --- /dev/null +++ b/data/GrabberConfig/filmstarts.de.txt @@ -0,0 +1,13 @@ +title: //div[@class='title large'] +author: //meta[@name='author']/@content + +body: //div[@id="col_main"] + +strip: //div[contains(@class, "samekind")] +strip: //div[contains(@class, "shareThis")] +strip: //div[@class="jtp"] +strip: //div[@id="showdisqus"] +strip: //div[@class="disqus_toggle"] +strip: //div[@class="lighten margin_20b"] + +test_url: http://www.filmstarts.de/nachrichten/18503731.html diff --git a/data/GrabberConfig/finance.yahoo.com.txt b/data/GrabberConfig/finance.yahoo.com.txt new file mode 100644 index 00000000..0c967db0 --- /dev/null +++ b/data/GrabberConfig/finance.yahoo.com.txt @@ -0,0 +1,12 @@ +title: //meta[@property='og:title']/@content +body: //div[@id='y-article-bd'] +body: //div[contains(@class, 'yom-art-content')] +strip: //div[contains(@class, 'related-companies')] +strip: //div[@id='y-article-related'] +strip: //div[@id='ypf-article-related'] +prune: no +tidy: no + +single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] + +test_url: http://finance.yahoo.com/news/canadian-orebodies-gives-notice-exercise-130000032.html
\ No newline at end of file diff --git a/data/GrabberConfig/findtheswagger.tumblr.com.txt b/data/GrabberConfig/findtheswagger.tumblr.com.txt new file mode 100644 index 00000000..43aef750 --- /dev/null +++ b/data/GrabberConfig/findtheswagger.tumblr.com.txt @@ -0,0 +1,10 @@ +date: //div[@class='notes']/a +body: //div[@id='content'] + +strip_id_or_class: tags +strip_id_or_class: permalink +strip_id_or_class: notes +strip_id_or_class: post_nav +strip: //div[@id='content']//h2 +strip_id_or_class: right_column +test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final
\ No newline at end of file diff --git a/data/GrabberConfig/finexpert.e15.cz.txt b/data/GrabberConfig/finexpert.e15.cz.txt new file mode 100644 index 00000000..7cf8feb5 --- /dev/null +++ b/data/GrabberConfig/finexpert.e15.cz.txt @@ -0,0 +1,3 @@ +strip_id_or_class: article-linktoanother + +test_url: http://finexpert.e15.cz/budiz-teplo-eu-stedre-zadotuje-nejen-plynovy-kotel
\ No newline at end of file diff --git a/data/GrabberConfig/firstthings.com.txt b/data/GrabberConfig/firstthings.com.txt new file mode 100644 index 00000000..ce972bac --- /dev/null +++ b/data/GrabberConfig/firstthings.com.txt @@ -0,0 +1,7 @@ +title: //div[@class='articleTitle'] +author: //div[@class='articleAuthor'] +body: //div[@class='articleContent'] +prune: no +convert_double_br_tags: yes + +test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand
\ No newline at end of file diff --git a/data/GrabberConfig/fivefilters.org.txt b/data/GrabberConfig/fivefilters.org.txt new file mode 100644 index 00000000..f37f02b9 --- /dev/null +++ b/data/GrabberConfig/fivefilters.org.txt @@ -0,0 +1,4 @@ +body: //section[contains(@class, 'container')] +prune: no + +test_url: http://fivefilters.org/kindle-it/ diff --git a/data/GrabberConfig/flyingmachinestudios.com.txt b/data/GrabberConfig/flyingmachinestudios.com.txt new file mode 100644 index 00000000..2053f801 --- /dev/null +++ b/data/GrabberConfig/flyingmachinestudios.com.txt @@ -0,0 +1,2 @@ +strip_id_or_class: linenos +test_url: http://www.flyingmachinestudios.com/programming/whoops-dci-refactoring/
\ No newline at end of file diff --git a/data/GrabberConfig/fm4.orf.at.txt b/data/GrabberConfig/fm4.orf.at.txt new file mode 100644 index 00000000..1e2ef6e0 --- /dev/null +++ b/data/GrabberConfig/fm4.orf.at.txt @@ -0,0 +1,14 @@ +author: //div[@class='authorDescription']/h2 +body: //div[@id='story'] +title: //h1[@class='detail'] + +date: translate(translate(substring-before(substring-after(//p[@class='date'],'Erstellt am:'), ' Uhr'), '. ', '.'), '-', ' ') +strip: //p[@class='date'] + +strip: //p[@class='credit'] + +tidy: no +prune: no +parser: html5lib + +test_url: http://fm4.orf.at/stories/1689156/ diff --git a/data/GrabberConfig/fnal.gov.txt b/data/GrabberConfig/fnal.gov.txt new file mode 100644 index 00000000..e404ccb8 --- /dev/null +++ b/data/GrabberConfig/fnal.gov.txt @@ -0,0 +1,15 @@ +title: normalize(//h1) + +author: //td/p[position()=last()]/em + +# I swear, this is really the best way to do this +date: normalize(//td[contains(@style, "color: #ffffff")]) + +# my god, it's full of tables +body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td +strip: //h1 + +# the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output. +strip: //p[position()=last()]/em +strip: //p[position()=last()]/child::text() +test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html
\ No newline at end of file diff --git a/data/GrabberConfig/focus-numerique.com.txt b/data/GrabberConfig/focus-numerique.com.txt new file mode 100644 index 00000000..493e4980 --- /dev/null +++ b/data/GrabberConfig/focus-numerique.com.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.focus-numerique.com%2Fpourquoi-ccd-va-disparaitre-news-9396.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' bloc_news ')] +strip_id_or_class: mail_reactions_news + +test_url: http://www.focus-numerique.com/pourquoi-ccd-va-disparaitre-news-9396.html diff --git a/data/GrabberConfig/focus.de.txt b/data/GrabberConfig/focus.de.txt new file mode 100644 index 00000000..6da3687e --- /dev/null +++ b/data/GrabberConfig/focus.de.txt @@ -0,0 +1,19 @@ +title: //h1 + +author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] + +date: //div[@class='articleHead']/span[@class='created'] + +body: //div[@id='article'] + +strip: //span[@class='markerText'] +strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] +strip: //div[@class='sidebar'] +strip: //div[@class='starbar'] +strip: //div[@class='actions clearfix'] +strip: //div[@id='commentForm'] +strip: //div[@id='commentSent'] +strip: //div[@id='comments'] +strip: //div[@class='similarityBlock'] + +test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html
\ No newline at end of file diff --git a/data/GrabberConfig/fok.nl.txt b/data/GrabberConfig/fok.nl.txt new file mode 100644 index 00000000..012f07df --- /dev/null +++ b/data/GrabberConfig/fok.nl.txt @@ -0,0 +1,4 @@ +# skip cookie warning +single_page_link: concat(//form/@action, '?allowcookies=yes') + +test_url: http://fok.nl/687116
\ No newline at end of file diff --git a/data/GrabberConfig/fokus.se.txt b/data/GrabberConfig/fokus.se.txt new file mode 100644 index 00000000..674a0f5b --- /dev/null +++ b/data/GrabberConfig/fokus.se.txt @@ -0,0 +1,3 @@ +title: //h2[contains(@class, 'entry-title')] +body: //div[contains(@class, 'entry-content')] +test_url: http://www.fokus.se/2017/03/olosta-karnfragor/ diff --git a/data/GrabberConfig/foley.com.txt b/data/GrabberConfig/foley.com.txt new file mode 100644 index 00000000..56ff407b --- /dev/null +++ b/data/GrabberConfig/foley.com.txt @@ -0,0 +1,5 @@ +http_header(user-agent): Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2 + +test_url: http://www.foley.com/rss/rss.aspx?id=2 +test_url: https://www.foley.com/solicitor-general-sides-with-sandoz-on-interpretation-of-biosimilar-statute-12-13-2016/ +test_contains: The Solicitor General of the United States diff --git a/data/GrabberConfig/folklore.org.txt b/data/GrabberConfig/folklore.org.txt new file mode 100644 index 00000000..ed23a0b6 --- /dev/null +++ b/data/GrabberConfig/folklore.org.txt @@ -0,0 +1,4 @@ +author: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[1]/td[2] +date: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[2]/td[2] +body: //div[@class='main'] +test_url: http://www.folklore.org/StoryView.py?story=Calculator_Construction_Set.txt
\ No newline at end of file diff --git a/data/GrabberConfig/food.com.txt b/data/GrabberConfig/food.com.txt new file mode 100644 index 00000000..a70da766 --- /dev/null +++ b/data/GrabberConfig/food.com.txt @@ -0,0 +1,11 @@ +body: //div[@id='print-area'] +title: //h1[contains(@class, 'section-title')] +single_page_link: //a[@id='prntrec'] +strip_image_src: food-logo-small +strip_id_or_class: timer +strip_id_or_class: photo-sm +strip_id_or_class: page-header + +prune: no + +test_url: http://www.food.com/recipe/couldnt-be-easier-bbq-pork-tenderloin-crock-pot-317152
\ No newline at end of file diff --git a/data/GrabberConfig/fool.com.txt b/data/GrabberConfig/fool.com.txt new file mode 100644 index 00000000..89cb8b9a --- /dev/null +++ b/data/GrabberConfig/fool.com.txt @@ -0,0 +1,11 @@ +body: //div[@class='entry-content'] +date: //meta[@name="date"]/@content +author: //meta[@name="author"]/@content + +strip_id_or_class: ecapShell +strip_id_or_class: noindent +strip_id_or_class: targetedPromotion + +prune: no + +test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/forbes.com.txt b/data/GrabberConfig/forbes.com.txt new file mode 100644 index 00000000..36d4757a --- /dev/null +++ b/data/GrabberConfig/forbes.com.txt @@ -0,0 +1,29 @@ +title: //hgroup//h1 +title: //span[@class='mainarttitle'] + +body: //div[@id='leftRail']//div[contains(@class, 'body')] + +author: //meta[@name="author"]/@content +author: //span[@class='mainartauthor'] + +date: substring-before(//hgroup//h6, '@') +date: //span[@class='mainartdate'] + +prune: no +strip: //aside +strip_id_or_class: sticky_sharing +strip_id_or_class: pagination +strip_id_or_class: controlsbox +strip_id_or_class: storyboxes +strip_id_or_class: sponsoredlinks +strip_id_or_class: nextpage +strip_id_or_class: contextuallinks +strip_id_or_class: article_actions +strip_id_or_class: engagement_block + +single_page_link: //a[contains(@href, '/print/')] + +http_header(user-agent): Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) + +test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html +test_url: http://www.forbes.com/sites/bruceupbin/2012/09/11/the-iphone-5-winners-and-losers/ diff --git a/data/GrabberConfig/foreignaffairs.com.txt b/data/GrabberConfig/foreignaffairs.com.txt new file mode 100644 index 00000000..cf8b742f --- /dev/null +++ b/data/GrabberConfig/foreignaffairs.com.txt @@ -0,0 +1,34 @@ +# TIDY +#tidy: no +# PRUNE +#prune: no + +# SINGLE PAGE +single_page_link: //div[@class='showlinks']/a + +# TITLE +title: //h1[@class="title"] + +# AUTHOR +author: //div[contains(@class,"field-field-article-display-authors")]/div/div/a/text() + +# DATE +date: //div[contains(@class,"field-field-article-issue")]/div/div/a/text() | //span[@class="date-display-single"] + +# BODY +body: //div[contains(@class,"content-resize")] + +# Remove clutter +strip: //div[@class="article-sidebar"] +strip: //div[@class="showlinks"] +strip: 
//div[contains(@class,"premium-box")] +strip: //div[contains(@class,"premium-box")] +strip: //table[contains(@border,"2")] + +# Fix picture captions +wrap_in(small): //p/img/following-sibling::em +wrap_in(small): //p[img]/text() + +# Fix sub-headlines +wrap_in(h3): //div[contains(@class,"field-field-article-subtitle")]/div/div/text() +test_url: http://www.foreignaffairs.com/articles/138810/pierre-n-leval/the-long-arm-of-international-law
\ No newline at end of file diff --git a/data/GrabberConfig/foreignpolicy.com.txt b/data/GrabberConfig/foreignpolicy.com.txt new file mode 100644 index 00000000..749ad37d --- /dev/null +++ b/data/GrabberConfig/foreignpolicy.com.txt @@ -0,0 +1,14 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' feature-hed ')] +author: substring-after(//p[contains(concat(' ',normalize-space(@class),' '),' feature-byline ')], 'BY ') +date: //div[contains(concat(' ',normalize-space(@class),' '),' timestamp ')]//time +body: //div[contains(concat(' ',normalize-space(@class),' '),' feature-body ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-inner ')] + +strip_id_or_class: share-links + +prune: no + +test_url: http://www.foreignpolicy.com/articles/2014/07/22/the_end_game_in_gaza_netanyahu_hamas +test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me +test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus +test_url: http://foreignpolicy.com/2016/04/26/fear-this-man-cyber-warfare-hacking-team-david-vincenzetti/ diff --git a/data/GrabberConfig/forsvaret.no.txt b/data/GrabberConfig/forsvaret.no.txt new file mode 100644 index 00000000..ec9e5807 --- /dev/null +++ b/data/GrabberConfig/forsvaret.no.txt @@ -0,0 +1,10 @@ +title: //div[@class="articleHeader"]/h1 +author: //p[@class="byline"] +date: //p[contains(@class,"publishedDate")]/span +# remove the right menu +strip: //div[contains(@class,"aside")] +# remove some SharePoint webpart label junk +strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"] +strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"] +test_url: https://forsvaret.no/aktuelt/historisk-medaljeutdeling +test_contains: Samarbeidet med Marinen har vært en sann glede diff --git a/data/GrabberConfig/fossbytes.com.txt b/data/GrabberConfig/fossbytes.com.txt new file mode 100644 index 00000000..7874e099 --- /dev/null +++ 
b/data/GrabberConfig/fossbytes.com.txt @@ -0,0 +1,33 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Ffossbytes.com%2Fnsa-call-detail-records-unauthorized-hundreds-of-millions%2F + +title: //meta[@property='og:title']/@content +body: //div[contains(concat(' ',normalize-space(@class),' '),' td-ss-main-content ')] +date: //div[contains(@class, 'entry-meta')]//time[@pubdate or @pubDate] +author: //div[contains(@class, 'entry-meta')]//a[@rel='author'] + +prune: no + +strip: //nav +strip: //header +strip: //*[@id='comments' or @id='respond'] +strip: //div[contains(@class, 'comments')] +strip_id_or_class: sharedaddy +strip_id_or_class: wpadvert +strip_id_or_class: commentlist +strip_id_or_class: sociable +strip_id_or_class: related_post +strip_id_or_class: wp-socializer +strip_id_or_class: addtoany +strip: //iframe +strip: //div[contains(concat(' ',normalize-space(@class),' '),' navigation ')] +strip: //blockquote[contains(concat(' ',normalize-space(@class),' '),' td_quote_box ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-sharing-bottom ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' author-box-wrap ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-sharing-top ')] +strip: //img[contains(concat(' ',normalize-space(@class),' '),' entry-thumb ')] + + + +test_url: https://fossbytes.com/nsa-call-detail-records-unauthorized-hundreds-of-millions/ diff --git a/data/GrabberConfig/foxnews.com.txt b/data/GrabberConfig/foxnews.com.txt new file mode 100644 index 00000000..e19c77db --- /dev/null +++ b/data/GrabberConfig/foxnews.com.txt @@ -0,0 +1,9 @@ +prune: no + +author: //meta[@name="dc.publisher"]/@content +date: //meta[@name="dc.date"]/@content +strip: //p[contains(@class, 'contributor vcard')] +replace_string(<ul><li><div class="photo">): <div class="photo"> 
+strip: //p[a[contains(., 'Click here to read more on this story ')]] + +test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/
\ No newline at end of file diff --git a/data/GrabberConfig/framablog.org.txt b/data/GrabberConfig/framablog.org.txt new file mode 100644 index 00000000..e69f9ff3 --- /dev/null +++ b/data/GrabberConfig/framablog.org.txt @@ -0,0 +1,3 @@ +title: //h1 + +test_url: https://framablog.org/2018/05/31/peertube-vers-la-version-1-et-au-dela/ diff --git a/data/GrabberConfig/france24.com.txt b/data/GrabberConfig/france24.com.txt new file mode 100644 index 00000000..6356e048 --- /dev/null +++ b/data/GrabberConfig/france24.com.txt @@ -0,0 +1,14 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.france24.com%2Fen%2F20150427-togo-gnassingbe-poised-extend-power-election%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article-long ')]//div[contains(concat(' ',normalize-space(@class),' '),' bd ')] +title: //h1[@class="title"] +author://p[@class="author"] +date://p[@class="modification"] + +find_string: <p class="modification">Latest update : +replace_string: <p class="modification"> + + +test_url: http://www.france24.com/en/20150427-togo-gnassingbe-poised-extend-power-election/
\ No newline at end of file diff --git a/data/GrabberConfig/franceculture.fr.txt b/data/GrabberConfig/franceculture.fr.txt new file mode 100644 index 00000000..b4e661dd --- /dev/null +++ b/data/GrabberConfig/franceculture.fr.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' text-zone ')] +src_lazy_load_attr: data-dejavu-src + +test_url: https://www.franceculture.fr/emissions/entendez-vous-leco/paroles-deconomistes-24-passer-leconomie-le-role-de-luniversite diff --git a/data/GrabberConfig/freelancer.com.txt b/data/GrabberConfig/freelancer.com.txt new file mode 100644 index 00000000..78d37729 --- /dev/null +++ b/data/GrabberConfig/freelancer.com.txt @@ -0,0 +1,3 @@ +body: //div[@id="projectDetailsContent"]//td + +test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html
\ No newline at end of file diff --git a/data/GrabberConfig/fria.nu.txt b/data/GrabberConfig/fria.nu.txt new file mode 100644 index 00000000..9d8eff97 --- /dev/null +++ b/data/GrabberConfig/fria.nu.txt @@ -0,0 +1,8 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.fria.nu/artikel/112079 +test_url: http://www.fria.nu/taxonomy/term/1928/all/feed
\ No newline at end of file diff --git a/data/GrabberConfig/friatidningen.se.txt b/data/GrabberConfig/friatidningen.se.txt new file mode 100644 index 00000000..1e4abc5a --- /dev/null +++ b/data/GrabberConfig/friatidningen.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.friatidningen.se/artikel/112074
\ No newline at end of file diff --git a/data/GrabberConfig/frontburner.dmagazine.com.txt b/data/GrabberConfig/frontburner.dmagazine.com.txt new file mode 100644 index 00000000..73f44324 --- /dev/null +++ b/data/GrabberConfig/frontburner.dmagazine.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Ffrontburner.dmagazine.com%2F2015%2F04%2F23%2Fthe-barrett-brown-review-of-arts-and-letters-and-jail-a-sign-of-things-to-come%2F + +body: //div[@id='article-body'] +test_url: http://frontburner.dmagazine.com/2015/04/23/the-barrett-brown-review-of-arts-and-letters-and-jail-a-sign-of-things-to-come/ diff --git a/data/GrabberConfig/ft.com.txt b/data/GrabberConfig/ft.com.txt new file mode 100644 index 00000000..407c2453 --- /dev/null +++ b/data/GrabberConfig/ft.com.txt @@ -0,0 +1,6 @@ +body: //div[@id='storyContent'] + +strip_id_or_class: shareArt +strip_id_or_class: promobox + +test_url: http://www.ft.com/cms/s/2/19fe32bc-d6db-11e5-8887-98e7feb46f27.html diff --git a/data/GrabberConfig/ftchinese.com.txt b/data/GrabberConfig/ftchinese.com.txt new file mode 100644 index 00000000..5c94d9b0 --- /dev/null +++ b/data/GrabberConfig/ftchinese.com.txt @@ -0,0 +1,18 @@ +# Modified to define the single_page_link +# This filter is tested on: +# http://www.ftchinese.com/story/001047373 +# http://www.ftchinese.com/story/001047631 +# http://www.ftchinese.com/story/001047622/?print=y +# http://www.ftchinese.com/story/001049052 +# http://www.ftchinese.com/story/001049088 + +title:substring-before(//title, '-') +author: //div[@class='byline']/a +date: //a[@class='storytime'] +#Set date in print view +#date: //div[@class='byline']/a/following-sibling::a +body: //div[@id="bodytext"] +strip://div[@class='pagination'] +single_page_link://div[@class='pagination']/a[.='全文'] +#next_page_link: //div[@class='pagination']//a[.='下一页'] +test_url: 
http://www.ftchinese.com/story/001049088
\ No newline at end of file diff --git a/data/GrabberConfig/futura-sciences.com.txt b/data/GrabberConfig/futura-sciences.com.txt new file mode 100644 index 00000000..e1a15c1d --- /dev/null +++ b/data/GrabberConfig/futura-sciences.com.txt @@ -0,0 +1,19 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://www.futura-sciences.com/magazines/espace/infos/actu/d/astronomie-surprenants-panaches-ceres-62066/ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' grid ') and (contains(concat(' ',normalize-space(@class),' '),' col-gauche '))] + +strip_id_or_class: ficheprevnext +strip_id_or_class: bar +strip_id_or_class: border-full +strip_id_or_class: httplogbar-wrapper +strip_id_or_class: diaporamafullscreenariane +strip_id_or_class: ariane +strip_id_or_class: diaporamafullscreen +strip_id_or_class: module-propal-nl +strip_id_or_class: readmore-wrapper +strip_id_or_class: bg-light-green + +test_url: http://www.futura-sciences.com/magazines/espace/infos/actu/d/astronomie-surprenants-panaches-ceres-62066/ +test_url: http://www.futura-sciences.com/magazines/high-tech/infos/actu/d/technologie-bientot-plus-besoin-machine-laver-62157/#xtor=RSS-8 diff --git a/data/GrabberConfig/futurezone.at.txt b/data/GrabberConfig/futurezone.at.txt new file mode 100644 index 00000000..207788a8 --- /dev/null +++ b/data/GrabberConfig/futurezone.at.txt @@ -0,0 +1,12 @@ +title: //h1[@class='title'] + +body: //section[@class='section'] +strip: //div[@class='swiper-wrapper'] +#strip: //figcaption[@class='ficaption'] +#strip: //figcaption/span + +date: //span[@class='published-time'] + + +test_url: https://futurezone.at/produkte/lego-verkauft-bugatti-chiron-aus-3599-teilen/400044290 +test_url: https://futurezone.at/science/nasa-verkuendet-am-donnerstag-neue-entdeckung-am-mars/400047008 diff --git a/data/GrabberConfig/gamasutra.com.txt 
b/data/GrabberConfig/gamasutra.com.txt new file mode 100644 index 00000000..75d46179 --- /dev/null +++ b/data/GrabberConfig/gamasutra.com.txt @@ -0,0 +1,22 @@ +# default view title +title: //span[@class='newsTitle'] +# print view title +title: //h3[@class='title'] + +# default view author +author: //span[@class='newsAuth']/a +author: substring-after(//span[@class='newsAuth'], 'by ') + +# default view date +date: //td[@class='newsDate'] + +# default view body +body: //td[@class='featureText'] +body: //td[@class='newsText'] + +strip: //h3[@class='title'] + +single_page_link: concat(//meta[@property="og:url"]/@content, '?print=1') +if_page_contains: //a[contains(@class, "articleNav")] + +test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php diff --git a/data/GrabberConfig/gameblog.fr.txt b/data/GrabberConfig/gameblog.fr.txt new file mode 100644 index 00000000..227d39ac --- /dev/null +++ b/data/GrabberConfig/gameblog.fr.txt @@ -0,0 +1,12 @@ +title: //meta[@property="og:title"]/@content +body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')] +author: //span[contains(concat(' ',normalize-space(@class),' '),' author ')] +date: //header[@id='gbArticleHeader']//div//time/@datetime + +prune: no + +strip_id_or_class: noprint +strip: //div[@id='gbNewsTextContent']/following-sibling::* + +test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video +test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible diff --git a/data/GrabberConfig/gamechurch.com.txt b/data/GrabberConfig/gamechurch.com.txt new file mode 100644 index 00000000..c9eea5f8 --- /dev/null +++ b/data/GrabberConfig/gamechurch.com.txt @@ -0,0 +1,10 @@ +title: //h1[@class='title'] + +date: substring-before(substring-after(//div[@class='comment-bubble']/.., 'Posted'), 'by') + +body: //div[@class='the-content'] + +strip: //div[@class='article-image responsive'] + +strip_id_or_class: 'pullquote' +test_url: 
http://gamechurch.com/virtual-gun-control-the-best-amendment/
\ No newline at end of file diff --git a/data/GrabberConfig/gamedev.net.txt b/data/GrabberConfig/gamedev.net.txt new file mode 100644 index 00000000..7a2c6495 --- /dev/null +++ b/data/GrabberConfig/gamedev.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.gamedev.net%2Fresources%2F_%2Ftechnical%2Fmath-and-physics%2Fpractical-use-of-vector-math-in-games-r2968 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-panel ')] +test_url: https://www.gamedev.net/resources/_/technical/math-and-physics/practical-use-of-vector-math-in-games-r2968 diff --git a/data/GrabberConfig/gamekult.com.txt b/data/GrabberConfig/gamekult.com.txt new file mode 100644 index 00000000..6234663a --- /dev/null +++ b/data/GrabberConfig/gamekult.com.txt @@ -0,0 +1,16 @@ +title: //article[@id='story-page']/h1 +date: //p[@class="byline"]/time/@datetime +body: //div[@class="gk__content__container"] | //div[@class="js-pm-refreshed-content"] + +strip: //aside[@id="beginning"] +strip: //select[@id="news_changing_node"] + +requires_login: yes + +login_uri: https://www.gamekult.com/utilisateur/connexion.html +login_username_field: _username +login_password_field: _password + +not_logged_in_xpath: //div[contains(@class,"gk__message--warning")] + +test_url: https://www.gamekult.com/actualite/quand-les-jeux-se-racontent-en-champ-contrechamp-3050810023.html diff --git a/data/GrabberConfig/gamer.no.txt b/data/GrabberConfig/gamer.no.txt new file mode 100644 index 00000000..e76a59d9 --- /dev/null +++ b/data/GrabberConfig/gamer.no.txt @@ -0,0 +1,11 @@ +body: //div[@class='pageContent description'] +date: //div[@class='authorsAndDateTime']/span[@title] +single_page_link: //div[@class='pages']/a[last()-1] + +# fix images and captions +wrap_in(figure): //div[contains(concat(' ', @class, ' '), ' image')] +wrap_in(figcaption): 
//div[contains(concat(' ', @class, ' '), ' image')]/div[@class='text']/text() + +# get rid of videos +strip_id_or_class: 'video full' +test_url: http://www.gamer.no/artikler/142455/slik-blei-ambisiose-dragons-dogma-skapt/
\ No newline at end of file diff --git a/data/GrabberConfig/gamereactor.no.txt b/data/GrabberConfig/gamereactor.no.txt new file mode 100644 index 00000000..6f7c1b9b --- /dev/null +++ b/data/GrabberConfig/gamereactor.no.txt @@ -0,0 +1,11 @@ +title: //div[@id='content']/div/h1 + +author: //a[@itemprop='reviewer'] + +date: //time[@itemprop='dtreviewed']/@datetime + +body: //div[@id='breadtext'] + +# fix for NOT magically removing anchors with text identical to title +dissolve: //a[text()=//div[@id='content']/div/h1/text()] +test_url: http://www.gamereactor.no/previews/177481/The+Evil+Within/?sid=38b5bd30f56f1b7214de4ff5bed4b76f
\ No newline at end of file diff --git a/data/GrabberConfig/ganglia.info.txt b/data/GrabberConfig/ganglia.info.txt new file mode 100644 index 00000000..2ae27b48 --- /dev/null +++ b/data/GrabberConfig/ganglia.info.txt @@ -0,0 +1,2 @@ +body: //*[(@id = "mid")] +test_url: http://ganglia.info/ diff --git a/data/GrabberConfig/gatopardo.com.txt b/data/GrabberConfig/gatopardo.com.txt new file mode 100644 index 00000000..2ab144f5 --- /dev/null +++ b/data/GrabberConfig/gatopardo.com.txt @@ -0,0 +1,8 @@ +body: //div[@class='panel'] +strip: //div[@style='float:right'] +strip: //span[@class='titulosHomePublicidad'] +strip: //div[@id='TitTop5Der'] +strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png'] + +prune: yes +test_url: http://www.gatopardo.com/ReportajesGP.php?R=95
\ No newline at end of file diff --git a/data/GrabberConfig/gawker.com.txt b/data/GrabberConfig/gawker.com.txt new file mode 100644 index 00000000..27e4b4bb --- /dev/null +++ b/data/GrabberConfig/gawker.com.txt @@ -0,0 +1,8 @@ +body: //div[@class="post-body"] + +# Remove 'content is restricted' +strip: //div[@id='agegate_IDHERE'] + +http_header(user-agent): PHP/5.3 + +test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy
\ No newline at end of file diff --git a/data/GrabberConfig/geeksofdoom.com.txt b/data/GrabberConfig/geeksofdoom.com.txt new file mode 100644 index 00000000..89eb402f --- /dev/null +++ b/data/GrabberConfig/geeksofdoom.com.txt @@ -0,0 +1,3 @@ +author: substring-after(//span[@class='storyauthor'],'Posted by') +date: //span[@class='storydate'] +test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/
\ No newline at end of file diff --git a/data/GrabberConfig/geenstijl.nl.txt b/data/GrabberConfig/geenstijl.nl.txt new file mode 100644 index 00000000..a664b4d9 --- /dev/null +++ b/data/GrabberConfig/geenstijl.nl.txt @@ -0,0 +1,3 @@ +body: //div[@id = 'article'] +strip: //div[@id = 'klasbox'] +test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html
\ No newline at end of file diff --git a/data/GrabberConfig/getnews.jp.txt b/data/GrabberConfig/getnews.jp.txt new file mode 100644 index 00000000..e28d4b8b --- /dev/null +++ b/data/GrabberConfig/getnews.jp.txt @@ -0,0 +1,3 @@ +body: //div[@class='post'] +strip: //ul[@id='bookmark_single'] +test_url: http://getnews.jp/archives/117312
\ No newline at end of file diff --git a/data/GrabberConfig/getpocket.com.txt b/data/GrabberConfig/getpocket.com.txt new file mode 100644 index 00000000..e6ca16ae --- /dev/null +++ b/data/GrabberConfig/getpocket.com.txt @@ -0,0 +1 @@ +http_header(user-agent): PHP/5.3
\ No newline at end of file diff --git a/data/GrabberConfig/giantbomb.com.txt b/data/GrabberConfig/giantbomb.com.txt new file mode 100644 index 00000000..61de51b2 --- /dev/null +++ b/data/GrabberConfig/giantbomb.com.txt @@ -0,0 +1,11 @@ +# 2011-11-19 - carlo@... - Initial setup. + +strip_id_or_class: user-review-detail +strip: //h1 + +body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"] + +author: //span[@class="reviewer"] | //p[@class="byline"]/a/text() +date: //span[@class="dtreviewed"] + +test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/
\ No newline at end of file diff --git a/data/GrabberConfig/giga.de.txt b/data/GrabberConfig/giga.de.txt new file mode 100644 index 00000000..e2689eae --- /dev/null +++ b/data/GrabberConfig/giga.de.txt @@ -0,0 +1,20 @@ +tidy:no +title://h2[@class="title"] +# author:"Ben Miller" +date://div[@id="stats"]/span +strip_id_or_class:stats +strip_id_or_class:breadcrumbs +strip_id_or_class:gn-why-content +strip_id_or_class:single-social +strip_id_or_class:sidebar-ads +strip_id_or_class:sidebar-top +strip_id_or_class:footer +strip_id_or_class:post_meta +# strip_id_or_class: +# strip_id_or_class: +# strip_id_or_class: +# strip_id_or_class: +# strip_id_or_class: +# strip_id_or_class: + +test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033
\ No newline at end of file diff --git a/data/GrabberConfig/gigaom.com.txt b/data/GrabberConfig/gigaom.com.txt new file mode 100644 index 00000000..cc8fdfa0 --- /dev/null +++ b/data/GrabberConfig/gigaom.com.txt @@ -0,0 +1,12 @@ +date: //meta[@name='dcterms.created']/@content +title: //meta[@property='og:title']/@content +author: //section[@class="post-meta"]//a[@rel="author"] + +body: //div[starts-with(@id, 'post-content-')] + +strip_id_or_class: sharedaddy + +prune: no + +test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/ +test_url: http://gigaom.com/2012/12/26/snapchat-rises-why-pokes-decline-shows-facebooks-inability-to-invent/
\ No newline at end of file diff --git a/data/GrabberConfig/gihyo.jp.txt b/data/GrabberConfig/gihyo.jp.txt new file mode 100644 index 00000000..d3534b29 --- /dev/null +++ b/data/GrabberConfig/gihyo.jp.txt @@ -0,0 +1,3 @@ +single_page_link: //p[@id='skip']//a[contains(@href, 'skip')] + +test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010
\ No newline at end of file diff --git a/data/GrabberConfig/gist.github.com.txt b/data/GrabberConfig/gist.github.com.txt new file mode 100644 index 00000000..90207862 --- /dev/null +++ b/data/GrabberConfig/gist.github.com.txt @@ -0,0 +1,6 @@ +body: //div[@class="highlight"]/pre + +prune: no +tidy: no + +test_url: https://gist.github.com/1258908
\ No newline at end of file diff --git a/data/GrabberConfig/git-scm.com.txt b/data/GrabberConfig/git-scm.com.txt new file mode 100644 index 00000000..89082831 --- /dev/null +++ b/data/GrabberConfig/git-scm.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fgit-scm.com%2Fdocs%2Fgitworkflows + +body: //div[@id='main'] +test_url: https://git-scm.com/docs/gitworkflows diff --git a/data/GrabberConfig/gizmodo.co.uk.txt b/data/GrabberConfig/gizmodo.co.uk.txt new file mode 100644 index 00000000..2eb82a6d --- /dev/null +++ b/data/GrabberConfig/gizmodo.co.uk.txt @@ -0,0 +1,7 @@ +body: //div[@id="leadimage" or @class="postcontent"] +author: //div[@class="contentauthor"] +date: //div[@class="timestamp"] + +prune: no + +test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/
\ No newline at end of file diff --git a/data/GrabberConfig/gizmodo.com.au.txt b/data/GrabberConfig/gizmodo.com.au.txt new file mode 100644 index 00000000..b4c818a6 --- /dev/null +++ b/data/GrabberConfig/gizmodo.com.au.txt @@ -0,0 +1,12 @@ +body: //div[@id='content_post' or @class="post-body" or contains(@class, 'illustration top')] +author: (//cite//span[@class="plus-icon"])[1] +date: //span[@class="date"] +date: //time + +find_string: <meta http-equiv="refresh" +replace_string: <meta http-equiv="disabled-refresh" + +prune: no + +test_url: http://www.gizmodo.com.au/2016/01/these-hateful-eightfigures-are-delightfully-retro/ +test_contains: NECA have fully unveiled diff --git a/data/GrabberConfig/gizmodo.com.txt b/data/GrabberConfig/gizmodo.com.txt new file mode 100644 index 00000000..c98d1ce3 --- /dev/null +++ b/data/GrabberConfig/gizmodo.com.txt @@ -0,0 +1,19 @@ +#body: //div[@class="post-body" or contains(@class, 'illustration top')] +body: //div[contains(@class, 'image-annotation-box') or contains(@class, 'post-content')] +#author: (//cite//span[@class="plus-icon"])[1] +author: //span[contains(@class, 'display-name')] +date: //span[@class="date"] + +strip_id_or_class: related +strip: //aside +strip: //svg +# For Gumbo parsing <svg>, <math> namespaced elems +strip: //*[local-name() = 'svg'] + +prune: no + +http_header(user-agent): PHP/5.3 + +test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science +test_url: http://gizmodo.com/what-van-goghs-paintings-would-look-like-if-they-came-874035680 +test_url: http://gizmodo.com/rss diff --git a/data/GrabberConfig/gizmodo.uol.com.br.txt b/data/GrabberConfig/gizmodo.uol.com.br.txt new file mode 100644 index 00000000..d963d684 --- /dev/null +++ b/data/GrabberConfig/gizmodo.uol.com.br.txt @@ -0,0 +1,6 @@ +title: //h1 + +body: //div[@id='destaques']//div[contains(@class, 'img')] | //div[@id='maincontent']//p + +test_url: http://gizmodo.uol.com.br/nvidia-gtx-titan-z/ +test_url: 
http://gizmodo.uol.com.br/perfil-mark-zuckerberg-hackeado/ diff --git a/data/GrabberConfig/gizmologia.com.txt b/data/GrabberConfig/gizmologia.com.txt new file mode 100644 index 00000000..d2c7c9f9 --- /dev/null +++ b/data/GrabberConfig/gizmologia.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://gizmologia.com/2011/09/amd-trinity-el-sucesor-de-llano-en-una-demostracion-muy-interesante
\ No newline at end of file diff --git a/data/GrabberConfig/gizmovil.com.txt b/data/GrabberConfig/gizmovil.com.txt new file mode 100644 index 00000000..5fc204b8 --- /dev/null +++ b/data/GrabberConfig/gizmovil.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://gizmovil.com/2011/09/hipertextual-labs-receptor-bluetooth-nokia-bh-214
\ No newline at end of file diff --git a/data/GrabberConfig/global.txt b/data/GrabberConfig/global.txt new file mode 100644 index 00000000..17274545 --- /dev/null +++ b/data/GrabberConfig/global.txt @@ -0,0 +1,27 @@ +# Look for Open Graph data - http://ogp.me +title: //meta[@property="og:title"]/@content +date: //meta[@property="article:published_time"]/@content +# article:author is sometimes a URL, e.g. on guardian.co.uk + +# Remove Google Publisher Tags: https://support.google.com/dfp_sb/answer/1649768?hl=en +#strip_id_or_class: div-gpt-ad + +# Strip doubleclick image ads +strip_image_src: doubleclick.net + +# If you get chunks of Javascript code appearing in the extracted output, try uncommenting the lines below. +# This tries to convert script tags to hidden div elements (which Full-Text RSS removes). +# If you notice issues with this approach, please let us know. +#find_string: <script +#replace_string: <div style="display:none" +#find_string: </script> +#replace_string: </div> + +# convert amp image tag to html image tag +find_string: <amp-img +replace_string: <img +find_string: </amp-img> +replace_string: <!-- nothing --> + +# strip all class attributes after processing (not supported in Full-Text RSS yet) +post_strip_attr: //*/@class diff --git a/data/GrabberConfig/globalgrind.com.txt b/data/GrabberConfig/globalgrind.com.txt new file mode 100644 index 00000000..e2f4e233 --- /dev/null +++ b/data/GrabberConfig/globalgrind.com.txt @@ -0,0 +1,6 @@ +body: //div[contains(@class, 'content-body')] + +prune: no + +test_url: http://globalgrind.com/2015/04/26/listen-jeremih-featuring-chance-the-rapper-the-social-experiment-planes-remix-new-music/ +test_contains: The Chicago rapper has made a name for himself
\ No newline at end of file diff --git a/data/GrabberConfig/globalissues.org.txt b/data/GrabberConfig/globalissues.org.txt new file mode 100644 index 00000000..ee50f68f --- /dev/null +++ b/data/GrabberConfig/globalissues.org.txt @@ -0,0 +1,15 @@ +body: //div[@id='content'] + +strip: //p[@class='top'] +strip: //h2[.='Where next?'] +strip_id_or_class: where-next +strip_id_or_class: social-bookmarks +strip_id_or_class: link-to-here +strip_id_or_class: options-heading +strip_id_or_class: page-options-content +strip_id_or_class: page-info-bottom + +tidy: no +prune: no + +test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism
\ No newline at end of file diff --git a/data/GrabberConfig/globalresearch.ca.txt b/data/GrabberConfig/globalresearch.ca.txt new file mode 100644 index 00000000..dbf784f1 --- /dev/null +++ b/data/GrabberConfig/globalresearch.ca.txt @@ -0,0 +1,4 @@ +http_header(user-agent): Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 + +test_url: http://www.globalresearch.ca/fallujah-is-being-slaughtered-silently/5495520 +test_contains: what is unknown about this city diff --git a/data/GrabberConfig/globoesporte.globo.com.txt b/data/GrabberConfig/globoesporte.globo.com.txt new file mode 100644 index 00000000..fd8e70ff --- /dev/null +++ b/data/GrabberConfig/globoesporte.globo.com.txt @@ -0,0 +1,25 @@ +title: //h1[@class="entry-title"] + +body: //div[@class='materia-titulo']/h2 | //*[@id="materia-letra"] + +date: //abbr[@class="published"] +date: //abbr[@class="updated"] + +author: //*[@class="author"]/strong + +strip: //div[contains(@class,'foto')]/strong +strip: //div[contains(@class,'frase-materia')]/div[@class='autor'] +strip: //div[contains(@class,'saibamais')] +strip: //*[contains(text(),'Clique aqui e veja mais')]/ancestor::p +strip: //ul[@class="toolbar"] + +# quotes +wrap_in(blockquote): //div[@id='materia-letra']//div[contains(@class,'frase-materia')]/div[@class='frase'] + +prune: no + +replace_string([Clique aqui e veja mais vídeos do Fluminense]): [] + +test_url: http://globoesporte.globo.com/atletismo/noticia/2013/08/michael-johnson-diz-que-bolt-e-melhor-da-historia-nao-ha-duvidas.html +test_url: http://globoesporte.globo.com/futebol/futebol-internacional/futebol-espanhol/noticia/2013/08/barca-atropela-levante-e-neymar-passa-em-branco-em-estreia-oficial.html +test_url: http://globoesporte.globo.com/futebol/times/fluminense/noticia/2013/08/poupado-no-sabado-felipe-se-diz-pronto-para-ser-titular-contra-o-goias.html diff --git a/data/GrabberConfig/gloswielkopolski.pl.txt b/data/GrabberConfig/gloswielkopolski.pl.txt new 
file mode 100644 index 00000000..16487955 --- /dev/null +++ b/data/GrabberConfig/gloswielkopolski.pl.txt @@ -0,0 +1,8 @@ +title: //article[@id='material']/header/h1 +author: //article[@id='material']/header/div[2]/p +date: //article[@id='material']/header/p/time[1] +body: //section[@id='tresc'] +next_page_link: .//section[@id='tresc']/div[@class='stronicowanie']/a[@rel='next'] +strip://div[@class='podobneSonda'] + +test_url: http://www.gloswielkopolski.pl/artykul/803547,abc-telemarketingu-praca-ktora-zwalnia-z-myslenia,id,t.html
\ No newline at end of file diff --git a/data/GrabberConfig/gnppn.fr.txt b/data/GrabberConfig/gnppn.fr.txt new file mode 100644 index 00000000..068a7c6a --- /dev/null +++ b/data/GrabberConfig/gnppn.fr.txt @@ -0,0 +1,5 @@ +title: //h1[@class="entry-title"] +author: //p[@class="site-title"] +date: //time[@class="entry-date published"]/@datetime + +test_url: https://gnppn.fr/2018/10/21/quotidiens-nationaux-et-abonnement-en-ligne-prime-a-la-penibilite/ diff --git a/data/GrabberConfig/goal.com.txt b/data/GrabberConfig/goal.com.txt new file mode 100644 index 00000000..e25e9a00 --- /dev/null +++ b/data/GrabberConfig/goal.com.txt @@ -0,0 +1,16 @@ +title: //div[@id='article_headline']//h1 +date: //div[contains(@class, 'articleDate')]//h4 +body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content'] + +strip_id_or_class: relatedLinksBox +strip_id_or_class: betting-widget +strip_image_src: install_flash.gif + +strip: //table[contains(@style, 'float: right; width: 285px;')] +strip: //div[@class='caption'] + +tidy: no +prune: no + +test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and- +test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to#
\ No newline at end of file diff --git a/data/GrabberConfig/gocomics.com.txt b/data/GrabberConfig/gocomics.com.txt new file mode 100644 index 00000000..212c02d5 --- /dev/null +++ b/data/GrabberConfig/gocomics.com.txt @@ -0,0 +1,5 @@ +body: //a[@class="photo"]/img[@class="strip"] +author: //meta[@name="author"]/@content +date: //meta[@property="gocomics:publish_date"]/@content + +test_url: http://www.gocomics.com/garfield/2015/06/13 diff --git a/data/GrabberConfig/gokulkrishh.github.io.txt b/data/GrabberConfig/gokulkrishh.github.io.txt new file mode 100644 index 00000000..b395fa9e --- /dev/null +++ b/data/GrabberConfig/gokulkrishh.github.io.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fgokulkrishh.github.io%2FJavascript-Guidelines%2F + +body: //section[contains(concat(' ',normalize-space(@class),' '),' main-content ')] +test_url: https://gokulkrishh.github.io/Javascript-Guidelines/ diff --git a/data/GrabberConfig/golem.de.txt b/data/GrabberConfig/golem.de.txt new file mode 100644 index 00000000..62af3030 --- /dev/null +++ b/data/GrabberConfig/golem.de.txt @@ -0,0 +1,43 @@ +# Author: zinnober +# Rewrite of original template which fetched the printer-version without pictures + +tidy: no +prune: no + +# Set full title +title: //h1 + +date: //time +author: //a[@rel='author'] + +# Content is here +body: //article + +# Fetch full multipage articles +next_page_link: //a[@id='atoc_next'] + +# Remove tracking and ads +strip_id_or_class: iqadtile4 + +# General Cleanup +strip_id_or_class: list-jtoc +strip_id_or_class: table-jtoc +strip_id_or_class: implied +strip_id_or_class: social- +strip_id_or_class: comments +strip_id_or_class: footer +strip_id_or_class: job-market +strip_id_or_class: tags + +# Tidy up galleries (could still be improved, though) +strip: //img[@src=''] +# This removes valid lists: +# 
https://www.golem.de/news/sap-berater-der-coolste-job-nach-tourismusmanager-und-bierbrauer-1807-135389-2.html +# strip: //li[not(*)] +strip: //div[contains(@style,'margin')] +strip: //figure[contains(@id,'gvideo')] + + +# Try yourself +test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html +test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html diff --git a/data/GrabberConfig/good.is.txt b/data/GrabberConfig/good.is.txt new file mode 100644 index 00000000..94159fbf --- /dev/null +++ b/data/GrabberConfig/good.is.txt @@ -0,0 +1,4 @@ +title: //div[@class="title"]/div/h1 +body: //div[@class="body"] +date: //li[@class="date-time"] +test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/
\ No newline at end of file diff --git a/data/GrabberConfig/goodfil.ms.txt b/data/GrabberConfig/goodfil.ms.txt new file mode 100644 index 00000000..f8bbbc6a --- /dev/null +++ b/data/GrabberConfig/goodfil.ms.txt @@ -0,0 +1,2 @@ +strip_id_or_class: gutter +test_url: http://goodfil.ms/blog/posts/2012/08/13/angularjs-and-the-goodfilms-mobile-site-part-1/
\ No newline at end of file diff --git a/data/GrabberConfig/gossip-tv.gr.txt b/data/GrabberConfig/gossip-tv.gr.txt new file mode 100644 index 00000000..e2d2d0b2 --- /dev/null +++ b/data/GrabberConfig/gossip-tv.gr.txt @@ -0,0 +1,14 @@ +date: //meta[@name='og:article:published_time']/@value + +body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] + +strip_id_or_class: itemImageGallery + +# remove extras at end of post content +find_string: <div style="margin:5px 0 10px;"> +replace_string: </div></body></html><!-- + +prune: no + +test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous +test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas
\ No newline at end of file diff --git a/data/GrabberConfig/goteborgsfria.se.txt b/data/GrabberConfig/goteborgsfria.se.txt new file mode 100644 index 00000000..c90aed0b --- /dev/null +++ b/data/GrabberConfig/goteborgsfria.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.goteborgsfria.se/artikel/112079
\ No newline at end of file diff --git a/data/GrabberConfig/gothamist.com.txt b/data/GrabberConfig/gothamist.com.txt new file mode 100644 index 00000000..36453878 --- /dev/null +++ b/data/GrabberConfig/gothamist.com.txt @@ -0,0 +1,7 @@ +title: //div[@class='entry-header'] +author: //span[@class='vcard author'] +date: //abbr[@class='published'] +#move_into(//div[@class='entry-body']): //img[@id='photo_1'] +body: //div[@class='entry-body'] +strip: //div[@class='galleryEaseThumbs'] +test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php
\ No newline at end of file diff --git a/data/GrabberConfig/gp.se.txt b/data/GrabberConfig/gp.se.txt new file mode 100644 index 00000000..158ae4ed --- /dev/null +++ b/data/GrabberConfig/gp.se.txt @@ -0,0 +1,11 @@ +body: //div[@id='articleContainer'] +author: //div[@id='articleContent']//div[contains(@class, 'byline')]//span[contains(@class, 'name fn')] +strip_id_or_class: toolbar +strip_id_or_class: ADad +strip_id_or_class: articleSerieWrapper +strip_id_or_class: articleFloatContainer +strip: //div[contains(@class, 'byline')]//img +prune: no + +test_url: http://www.gp.se/nyheter/bohuslan/1.2045564-styckade-mannen-hade-mordat-hustrun +test_url: http://www.gp.se/1.16560
\ No newline at end of file diff --git a/data/GrabberConfig/gq.com.txt b/data/GrabberConfig/gq.com.txt new file mode 100644 index 00000000..8ad8a14e --- /dev/null +++ b/data/GrabberConfig/gq.com.txt @@ -0,0 +1,9 @@ +next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a +strip_id_or_class: utility +strip_id_or_class: keywords +strip_id_or_class: pagination +strip_id_or_class: position2_content +body: //div[@class='article'] +title: //h1[@class='content-headline'] +author: //span[@class='contributor']//a +test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012
\ No newline at end of file diff --git a/data/GrabberConfig/grantland.com.txt b/data/GrabberConfig/grantland.com.txt new file mode 100644 index 00000000..b8d419f4 --- /dev/null +++ b/data/GrabberConfig/grantland.com.txt @@ -0,0 +1,20 @@ +# this is fragile with footnotes -- leave it for now + +#tidy: no +#prune: no +#move_into(//article): //aside[@id='footnotes'] +author: //cite/a +date: //time + +strip: //a[text()='Grantland'] +strip_id_or_class: ad-wrapper +strip_id_or_class: fb-connect-link +strip_id_or_class: fb-status +strip: //li[@class='print'] +strip: //cite +strip: //a[contains(text(), '[+]')] +strip: //a[@id='jump-nav-link'] +strip: //h1[text()='Share This'] +strip: //h1[text()='Top Stories'] +strip: //div[@id="update-text-size"] +test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day
\ No newline at end of file diff --git a/data/GrabberConfig/greatergreaterwashington.org.txt b/data/GrabberConfig/greatergreaterwashington.org.txt new file mode 100644 index 00000000..31a41075 --- /dev/null +++ b/data/GrabberConfig/greatergreaterwashington.org.txt @@ -0,0 +1,11 @@ +title: //div[@class="blogpost"]/h2 +author: //div[@class="blogpost"]/p[@class="byline"]/a +date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"] +body: //div[@class="blogpost"] +strip_id_or_class: flag +strip_id_or_class: byline +strip_id_or_class: post_footer +strip_id_or_class: related_posts +strip_id_or_class: post_author_bios +strip: //h2 +test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/
\ No newline at end of file diff --git a/data/GrabberConfig/groups.drupal.org.txt b/data/GrabberConfig/groups.drupal.org.txt new file mode 100644 index 00000000..0fe30ef5 --- /dev/null +++ b/data/GrabberConfig/groups.drupal.org.txt @@ -0,0 +1,5 @@ +title://h1 +author://span[@class="submitted"]/a +date:substring-after(//span[@class="submitted"],'on ') +body://div[@class="content"] +test_url: http://groups.drupal.org/node/36816
\ No newline at end of file diff --git a/data/GrabberConfig/grubstreet.com.txt b/data/GrabberConfig/grubstreet.com.txt new file mode 100644 index 00000000..f9dbc265 --- /dev/null +++ b/data/GrabberConfig/grubstreet.com.txt @@ -0,0 +1,18 @@ +#copied from nymag.com.txt + +title: //h2[contains(@class, 'primary')] +body: //*[@itemprop="articleBody"] +body: //div[@id='story'] +author: //*[@class='by']/a +date: substring-after(//*[@class='date'], 'Published') + +#Skip GDPR warning +http_header(Cookie): nymuc=11111111111 + +parser: html5php +tidy: no + +next_page_link: //div[@class='page-navigation']//li[@class='next']/a + +test_url: http://www.grubstreet.com/2018/06/anthony-bourdain-and-the-silent-epidemic-of-male-suicide.html +test_contains: Bourdain was a truth-telling globe-trotter diff --git a/data/GrabberConfig/grumpygamer.com.txt b/data/GrabberConfig/grumpygamer.com.txt new file mode 100644 index 00000000..9bea56f6 --- /dev/null +++ b/data/GrabberConfig/grumpygamer.com.txt @@ -0,0 +1,5 @@ +title: //h3[@class="post-title"] +author: "Ron Gilbert" +body: //div[@class="grumpypost"] + +test_url: https://grumpygamer.com/scope_budget_schedule diff --git a/data/GrabberConfig/gsmarena.com.txt b/data/GrabberConfig/gsmarena.com.txt new file mode 100644 index 00000000..34883965 --- /dev/null +++ b/data/GrabberConfig/gsmarena.com.txt @@ -0,0 +1,3 @@ +next_page_link: //a[@class='pages-next'] + +test_url: http://www.gsmarena.com/samsung_galaxy_j2-review-1348.php
\ No newline at end of file diff --git a/data/GrabberConfig/gulfnews.com.txt b/data/GrabberConfig/gulfnews.com.txt new file mode 100644 index 00000000..97b620de --- /dev/null +++ b/data/GrabberConfig/gulfnews.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article'] +strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1] +prune: no +tidy: no +test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084
\ No newline at end of file diff --git a/data/GrabberConfig/guokr.com.txt b/data/GrabberConfig/guokr.com.txt new file mode 100644 index 00000000..f8327bea --- /dev/null +++ b/data/GrabberConfig/guokr.com.txt @@ -0,0 +1,22 @@ +# To administrator: +# Please change the hostname to "www.guokr.com/article/*" +# Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com + +# This filter is tested on: +# http://www.guokr.com/article/274325/ +# http://www.guokr.com/article/275013/ + +title://h1 +author://div[contains(@class, 'content-th-info')]/a +date://div[contains(@class, 'content-th-info')]/span +body://div[contains(@class, 'Content')] + +strip://div[contains(@class, 'bottom-i')] +strip://div[contains(@class, 'copyright')] +strip://div[contains(@class, 'fr')] +strip://div[contains(@class, 'content-th-info')] +strip://h1[contains(@id, 'articleTitle')] +strip://div[contains(@class, 'side')] +strip://div[contains(@class, 'top-wp')] +test_url: http://www.guokr.com/article/275013/ +test_url: http://www.guokr.com/article/338387/
\ No newline at end of file diff --git a/data/GrabberConfig/gurumed.org.txt b/data/GrabberConfig/gurumed.org.txt new file mode 100644 index 00000000..9e775fd3 --- /dev/null +++ b/data/GrabberConfig/gurumed.org.txt @@ -0,0 +1,6 @@ +prune: no +body: //div[@class='entry'] +strip: //div[@class='addthis_toolbox'] +strip: //div[@class='yarpp-related'] + +test_url: http://www.gurumed.org/2015/06/22/nous-entrons-dsormais-dans-la-sixime-extinction-massive/ diff --git a/data/GrabberConfig/gurusblog.com.txt b/data/GrabberConfig/gurusblog.com.txt new file mode 100644 index 00000000..3cbdcd83 --- /dev/null +++ b/data/GrabberConfig/gurusblog.com.txt @@ -0,0 +1,4 @@ +title: //h1[@class='entry-title'] +body: //div[@class='content'] + +test_url: https://www.gurusblog.com/archives/las-subastas-de-joyas-de-christies-y-sothebys-rompen-todos-los-records-historicos/15/11/2018/ diff --git a/data/GrabberConfig/guyaweb.com.txt b/data/GrabberConfig/guyaweb.com.txt new file mode 100644 index 00000000..0335658b --- /dev/null +++ b/data/GrabberConfig/guyaweb.com.txt @@ -0,0 +1,27 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' news-entry ')] + +strip_id_or_class: headline +strip_id_or_class: post-meta +strip_id_or_class: post-info +strip_id_or_class: share-container +strip: //div[contains(concat(' ',normalize-space(@class),' '),' reserved ')]/following-sibling::* +strip_id_or_class: reserved +strip: //div[contains(concat(' ',normalize-space(@class),' '),' form-inner ')]/parent::div/following-sibling::* +strip: //div[contains(concat(' ',normalize-space(@class),' '),' form-inner ')]/parent::div + +test_url: http://www.guyaweb.com/actualites/news/culture/caravane-top-vakans-sillone-guyane/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' 
reserved ')] +login_uri: https://www.guyaweb.com/ +login_username_field: username +login_password_field: password +login_extra_fields: crb_login_user=1 +login_extra_fields: submit=ok + +test_url: http://www.guyaweb.com/actualites/news/culture/gabriel-serville-regrette-suppression-de-france-o/ + diff --git a/data/GrabberConfig/haberler.com.txt b/data/GrabberConfig/haberler.com.txt new file mode 100644 index 00000000..1bb2bc7d --- /dev/null +++ b/data/GrabberConfig/haberler.com.txt @@ -0,0 +1,5 @@ +title: //div[@id="habermetni"]/h1[@id="haber_baslik"] +body: //div[@id="habermetni"]/p +strip: //img[@class='newsDetailLeft'] +strip_image_src: /haber-resimleri/ +test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/
\ No newline at end of file diff --git a/data/GrabberConfig/habrahabr.ru.txt b/data/GrabberConfig/habrahabr.ru.txt new file mode 100644 index 00000000..7c88dbc0 --- /dev/null +++ b/data/GrabberConfig/habrahabr.ru.txt @@ -0,0 +1,11 @@ +title: //span[@class="post_title"] +author: //div[@class="author"] +date: //div[@class="published"] + +body: //div[@class="post__text"] + +prune: no +tidy: no + +test_url: http://habrahabr.ru/post/229883/ +test_url: https://habrahabr.ru/company/mailru/blog/324078/ diff --git a/data/GrabberConfig/hackersrepublic.org.txt b/data/GrabberConfig/hackersrepublic.org.txt new file mode 100644 index 00000000..da01c213 --- /dev/null +++ b/data/GrabberConfig/hackersrepublic.org.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.hackersrepublic.org%2Fculture-du-hacking%2Fune-histoire-de-privileges + +body: //div[contains(concat(' ',normalize-space(@class),' '),' field-name-body ')]//div[contains(concat(' ',normalize-space(@class),' '),' field-items ')]//div[contains(concat(' ',normalize-space(@class),' '),' field-item ') and (contains(concat(' ',normalize-space(@class),' '),' even '))] +test_url: http://www.hackersrepublic.org/culture-du-hacking/une-histoire-de-privileges diff --git a/data/GrabberConfig/hackmake.org.txt b/data/GrabberConfig/hackmake.org.txt new file mode 100644 index 00000000..98140117 --- /dev/null +++ b/data/GrabberConfig/hackmake.org.txt @@ -0,0 +1,7 @@ +date: //article//time[@pubdate] +body: //article/div[@id="post-wide"] +title: //article/header/h2 +strip: /div[@id="comment"] +strip: //footer +author: substring-after(//footer/p[@class='byline'] , 'By') +test_url: http://hackmake.org/2012/12/21/mindfulness-of-concentration
\ No newline at end of file diff --git a/data/GrabberConfig/hacks.mozilla.org.txt b/data/GrabberConfig/hacks.mozilla.org.txt new file mode 100644 index 00000000..3c6012c4 --- /dev/null +++ b/data/GrabberConfig/hacks.mozilla.org.txt @@ -0,0 +1,5 @@ +title: //div[@id="content-head"]//h1 +date: //abbr[@class='published']/@title +author: //h3[@class='post__author']//a[@class='url'] + +test_url: https://hacks.mozilla.org/2018/01/making-webassembly-even-faster-firefoxs-new-streaming-and-tiering-compiler/ diff --git a/data/GrabberConfig/halo.bungie.org.txt b/data/GrabberConfig/halo.bungie.org.txt new file mode 100644 index 00000000..1802efea --- /dev/null +++ b/data/GrabberConfig/halo.bungie.org.txt @@ -0,0 +1,5 @@ +title:substring-before(id("maincontent")/table, 'Posted') +body:id("maincontent")/p +# eventually convert linebreaks better + +test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html
\ No newline at end of file diff --git a/data/GrabberConfig/handelsblatt.com.txt b/data/GrabberConfig/handelsblatt.com.txt new file mode 100644 index 00000000..7d067aa6 --- /dev/null +++ b/data/GrabberConfig/handelsblatt.com.txt @@ -0,0 +1,31 @@ +#Single Page +single_page_link: //li[contains(@class,"hcf-print")]/a + +# Title hcf-headline +title: //span[@class='hcf-headline'] + +# Authors +author: //div[@class="hcf-author"]/a/text() +author: substring-after(//div[@class='hcf-author'], 'von ') + +# Date +date: //div[@class='hcf-article-date'] + +# Body +body: //div[@class='article'] + +# General removals +strip: //div[contains(@class,"hcf-smartbox")] +strip: //div[contains(@class,"hcf-stopper")] +strip: //div[contains(@class,"hcf-img-controls")] +strip: //span[@class='hcf-location-mark'] +strip: //span[@class='hcf-copyright'] +strip: //div[@class='hcf-copyright'] +strip: //div[@class='hcf-origin'] + + + + +# Fix picture captions +wrap_in(small): //div[@class="hcf-caption"] +test_url: http://www.handelsblatt.com/meinung/gastbeitraege/gastkommentar-zum-emissionshandel-kurskorrekturen-fuehren-zum-kentern/8044326.html
\ No newline at end of file diff --git a/data/GrabberConfig/hanselman.com.txt b/data/GrabberConfig/hanselman.com.txt new file mode 100644 index 00000000..1dca632f --- /dev/null +++ b/data/GrabberConfig/hanselman.com.txt @@ -0,0 +1,4 @@ +date: //span[@class="item-date"] +body: //div[@class="item-content"] +strip_comments: no +test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/happyassassin.net.txt b/data/GrabberConfig/happyassassin.net.txt new file mode 100644 index 00000000..cba02c64 --- /dev/null +++ b/data/GrabberConfig/happyassassin.net.txt @@ -0,0 +1,3 @@ +http_header(user-agent): PHP/7.2 + +test_url: https://www.happyassassin.net/2014/01/25/uefi-boot-how-does-that-actually-work-then/ diff --git a/data/GrabberConfig/hardware-infos.com.txt b/data/GrabberConfig/hardware-infos.com.txt new file mode 100644 index 00000000..1d63b6cd --- /dev/null +++ b/data/GrabberConfig/hardware-infos.com.txt @@ -0,0 +1,10 @@ +tidy: no +prune: no + +title: //div[@class='content post']/h1 +body: //div[@class='post'] + +next_page_link: //a[preceding::div[@class='pages']] + +test_url: http://www.hardware-infos.com/news/5646/amd-bei-next-gen-api-mit-besserer-gpu-auslastung.html +test_url: http://www.hardware-infos.com/tests/grafikkarten/sapphire-r9-280x-vapor-x-tri-x-oc.html diff --git a/data/GrabberConfig/hardware.fr.txt b/data/GrabberConfig/hardware.fr.txt new file mode 100644 index 00000000..e4f1f6bc --- /dev/null +++ b/data/GrabberConfig/hardware.fr.txt @@ -0,0 +1,6 @@ +title: //h1 +author: //a[@class='a_aut'] +body: //div[@class='content_dossier'] +strip: //div[@id='pagination'] +next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href +test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html
\ No newline at end of file diff --git a/data/GrabberConfig/hardware.no.txt b/data/GrabberConfig/hardware.no.txt new file mode 100644 index 00000000..cbbcf84e --- /dev/null +++ b/data/GrabberConfig/hardware.no.txt @@ -0,0 +1,16 @@ +title: //h1[@class='headline'] +title: //h2[@itemprop='alternativeHeadline'] +title: //h1[@itemprop='headline'] +author: //span[@itemprop='name'] +date: //time[@itemprop='datePublished'] +body: //div[@itemprop='reviewBody'] + +wrap_in(blockquote): //div[@class='factBox'] + +next_page_link: //a[@rel='next'] + +strip_id_or_class: 'product-box' +strip: //a[@rel='next'] +strip: //a[text()='Del på Facebook'] +strip: //a[text()='Del på Twitter'] +test_url: http://www.hardware.no/artikler/asus-vg248qe/132792
\ No newline at end of file diff --git a/data/GrabberConfig/hardwareluxx.de.txt b/data/GrabberConfig/hardwareluxx.de.txt new file mode 100644 index 00000000..227bea77 --- /dev/null +++ b/data/GrabberConfig/hardwareluxx.de.txt @@ -0,0 +1,19 @@ +tidy: no +prune: no + +title: (//span[@itemprop='title'])[last()] +author: //span[@itemprop='author']//span[@itemprop='name'] +body: //article +date: //time + +strip: //header +strip: //div[@id='bcr'] +strip: //footer +strip: //section +strip: //figure[@class='dynbench'] +strip: //div[@class='pagenavbar'] + +next_page_link: //span[@class='next']/a + +test_url: http://www.hardwareluxx.de/index.php/news/allgemein/netzpolitik/35169-creator-space-youtube-eroeffnet-neues-studio-in-berlin.html +test_url: http://www.hardwareluxx.de/index.php/artikel/hardware/komplettsysteme/35020-caseking-king-mod-titan-oc-gaming-pc-im-test.html diff --git a/data/GrabberConfig/hazlitt.net.txt b/data/GrabberConfig/hazlitt.net.txt new file mode 100644 index 00000000..b360fc1a --- /dev/null +++ b/data/GrabberConfig/hazlitt.net.txt @@ -0,0 +1,5 @@ +body: //div[@id='article-wrap'] +title: //h1[@class='article-title'] +author: //div[@class='article-footer']//div[@class='author-name']/a + +test_url: https://hazlitt.net/longreads/real-lolita diff --git a/data/GrabberConfig/hbr.org.txt b/data/GrabberConfig/hbr.org.txt new file mode 100644 index 00000000..c2f292e1 --- /dev/null +++ b/data/GrabberConfig/hbr.org.txt @@ -0,0 +1,7 @@ +title: //div[@id='article-title'] +author: //div[@id='articleAuthors'] +body: //div[@id='article'] +strip: //div[@class='module wide'] +#single_page_link: //a[@class='social-print'] +test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/ +test_url: http://hbr.org/2013/03/big-bang-disruption/ar/
\ No newline at end of file diff --git a/data/GrabberConfig/headrush.typepad.com.txt b/data/GrabberConfig/headrush.typepad.com.txt new file mode 100644 index 00000000..a3146771 --- /dev/null +++ b/data/GrabberConfig/headrush.typepad.com.txt @@ -0,0 +1,14 @@ +title://div[@class='content']/h3[1] +body://div[@class='content'] + +# Article nav +strip://div[@class='content']/p[1] + +# Comments and trackbacks +strip://h2/following-sibling::p +strip://h2 + +# Posted on +strip://b/p +strip://div[@class='content']/p[@class='posted'] +test_url: http://headrush.typepad.com/creating_passionate_users/2005/05/the_case_for_ea.html
\ No newline at end of file diff --git a/data/GrabberConfig/health.com.txt b/data/GrabberConfig/health.com.txt new file mode 100644 index 00000000..3425b20d --- /dev/null +++ b/data/GrabberConfig/health.com.txt @@ -0,0 +1,4 @@ +http_header(cookie): euConsent=true; euConsentId=61c78ceb-c244-4016-b707-3c640ba09311 + +test_url: https://www.health.com/news/cell-phone-elbow-new-ill-wired-age +test_contains: As symptoms progress diff --git a/data/GrabberConfig/healthland.time.com.txt b/data/GrabberConfig/healthland.time.com.txt new file mode 100644 index 00000000..0542097b --- /dev/null +++ b/data/GrabberConfig/healthland.time.com.txt @@ -0,0 +1,10 @@ +date: //span[@class = 'date'] +body: //div[@class = 'entry-content'] +strip://div[@class='more-ways'] +strip://div[@id = 'stayConnected'] +strip://p[child::a[@rel = 'bookmark']] +strip://p[starts-with(string(.),'(MORE:')] +strip://p[starts-with(string(.),'(PHOTOS:')] +move_into(//p[../@class = 'entry-content'][position() = last()])://div[@id = 'featbox'] + +test_url: http://healthland.time.com/2011/07/24/amy-winehouse-and-the-pain-of-addiction/?preview=true&preview_id=39210&preview_nonce=0777d4e408
\ No newline at end of file diff --git a/data/GrabberConfig/heise-online.mobi.txt b/data/GrabberConfig/heise-online.mobi.txt new file mode 100644 index 00000000..daff6143 --- /dev/null +++ b/data/GrabberConfig/heise-online.mobi.txt @@ -0,0 +1,3 @@ +body: //div[@id='content']/div +date: //p[@class='author_date']/span[@class='date'] +test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html
\ No newline at end of file diff --git a/data/GrabberConfig/heise.de.txt b/data/GrabberConfig/heise.de.txt new file mode 100644 index 00000000..e5e88c81 --- /dev/null +++ b/data/GrabberConfig/heise.de.txt @@ -0,0 +1,70 @@ +# Author: zinnober +# Template should work well with either desktop or mobile version (m.heise.de) + +prune: no + +date: //p[@class='news_datum'] +author: //span[@class='author'] + +body: //article | //div[@class='meldung_wrapper'] | //section[@id='artikel_text'] + +strip: //nav + +# General cleanup +strip: //time +strip: //header +strip: //h4[@class='author'] +strip: //div[@class='gallery compact']/h3 +strip: //div[@class='gallery compact']/figcaption +strip: //p[@class='news_datum'] +strip: //p[@class='artikel_datum'] +strip: //p[@class='news_navi'] +strip: //p[@class='printversion'] +strip: //a[contains(@href, 'mailto')] +strip: //div[@class='gallery compact']/h2 +strip: //p[@class='themen_foren'] +strip: //style +strip: //span[@class='source'] +#strip: //div[@class='gallery compact']/figcaption +strip_id_or_class: comments +strip_id_or_class: ISI_IGNORE +#strip_id_or_class: clear + +strip_id_or_class: linkurl_grossbild +strip_id_or_class: image-num +strip_id_or_class: heisebox_right +strip_id_or_class: dossier +strip_id_or_class: latest_posting_snippet +strip_id_or_class: a-pvgs +strip_id_or_class: a-pvg__body + +# Strip Ads +strip_id_or_class: ad_ + +# Some optimizations +replace_string(<h5>): <h2> +replace_string(</h5>): </h2> +replace_string(<title>Druckversion - ): <title> +replace_string( | heise online</title>): </title> +replace_string( | c't Magazin</title>): </title> +replace_string( | Telepolis</title>): </title> +replace_string( | heise Security</title>): </title> +replace_string( | heise Autos</title>): </title> +# this line breaks the parser +#replace_string(<span class="bild_rechts" style="width:): <p " +replace_string(<div class="heisebox">): <blockquote> + +single_page_link: //a[contains(@href, '?view=print')] 
+single_page_link: //a[contains(@title, 'Druck')] + +next_page_link: //a[@class='next' and not(contains(text(), 'Artikel'))] +next_page_link: //a[@title='vor'] +next_page_link: //a[@rel='next'] + +test_url: http://www.heise.de/open/artikel/Die-Neuerungen-von-Linux-3-15-2196231.html +test_url: http://m.heise.de/open/artikel/Die-Neuerungen-von-Linux-3-15-2196231.html +test_url: http://www.heise.de/newsticker/meldung/Ueberwachungstechnik-Die-globale-Handy-Standortueberwachung-2301494.html +test_url: http://www.heise.de/newsticker/meldung/Bodenradar-fuer-selbstfahrende-Autos-horcht-unter-die-Strasse-3273941.html +test_url: http://www.heise.de/tp/artikel/49/49473/1.html +test_url: http://www.heise.de/ct/artikel/Die-Neuerungen-von-Linux-3-15-2196231.html +test_url: http://heise.de/-3527918 diff --git a/data/GrabberConfig/help.fivefilters.org.txt b/data/GrabberConfig/help.fivefilters.org.txt new file mode 100644 index 00000000..70a7d156 --- /dev/null +++ b/data/GrabberConfig/help.fivefilters.org.txt @@ -0,0 +1,2 @@ +title: //div[@class="title"]/h3 +date: substring-after(//div[@class="meta"], ": ") diff --git a/data/GrabberConfig/hemmings.com.txt b/data/GrabberConfig/hemmings.com.txt new file mode 100644 index 00000000..a02b4a62 --- /dev/null +++ b/data/GrabberConfig/hemmings.com.txt @@ -0,0 +1,9 @@ +title: //h2 +body: //div[@id='leftdetail'] +single_page_link: //a[contains(@href, 'printable=1')] +strip: //a[contains(., 'Full Version')] + +prune: no + +test_url: http://www.hemmings.com/classifieds/dealer/ferrari/330gtc/1601235.html +test_url: http://www.hemmings.com/rss/keyword.xml?adtype=carsforsale&make=ferrari
\ No newline at end of file diff --git a/data/GrabberConfig/hespress.com.txt b/data/GrabberConfig/hespress.com.txt new file mode 100644 index 00000000..4ed0b8b5 --- /dev/null +++ b/data/GrabberConfig/hespress.com.txt @@ -0,0 +1,7 @@ +body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body'] + +prune: no +tidy: no + +test_url: http://hespress.com/videos/73684.html +test_url: http://hespress.com/permalink/73678.html
\ No newline at end of file diff --git a/data/GrabberConfig/highscalability.com.txt b/data/GrabberConfig/highscalability.com.txt new file mode 100644 index 00000000..5a808fa4 --- /dev/null +++ b/data/GrabberConfig/highscalability.com.txt @@ -0,0 +1,3 @@ +body: //div[@class='journal-entry-text'] + +test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html
\ No newline at end of file diff --git a/data/GrabberConfig/hiiraan.com.txt b/data/GrabberConfig/hiiraan.com.txt new file mode 100644 index 00000000..cf1f7942 --- /dev/null +++ b/data/GrabberConfig/hiiraan.com.txt @@ -0,0 +1,10 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.hiiraan.com%2Fnews%2F2014%2FDec%2Fwararka_maanta20-89428.htm + +body: //div[contains(concat(' ',normalize-space(@class),' '),' single ')]//div[contains(concat(' ',normalize-space(@class),' '),' description ')] + +prune: no + +test_url: http://www.hiiraan.com/news/2014/Dec/wararka_maanta20-89428.htm +test_url: http://rss.hiiraan.com/wararka_maanta_rss.xml
\ No newline at end of file diff --git a/data/GrabberConfig/hiperpop.com.txt b/data/GrabberConfig/hiperpop.com.txt new file mode 100644 index 00000000..b5eb062e --- /dev/null +++ b/data/GrabberConfig/hiperpop.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://hiperpop.com/2011/09/marc-anthony-celebra-su-cumpleanos-con-jennifer-lopez
\ No newline at end of file diff --git a/data/GrabberConfig/hipertextual.com.txt b/data/GrabberConfig/hipertextual.com.txt new file mode 100644 index 00000000..2cb7ca51 --- /dev/null +++ b/data/GrabberConfig/hipertextual.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://hipertextual.com/2011/09/banda-ancha-en-america-latina-insignificante diff --git a/data/GrabberConfig/hiphopleeft.nl.txt b/data/GrabberConfig/hiphopleeft.nl.txt new file mode 100644 index 00000000..d869a866 --- /dev/null +++ b/data/GrabberConfig/hiphopleeft.nl.txt @@ -0,0 +1,4 @@ +body: //div[@class = 'pd'] +strip: //div[@id = 'overzicht-albumrecensies'] +strip: //div[@id = 'jc'] +test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142
\ No newline at end of file diff --git a/data/GrabberConfig/histoire.presse.fr.txt b/data/GrabberConfig/histoire.presse.fr.txt new file mode 100644 index 00000000..bbd1cbd6 --- /dev/null +++ b/data/GrabberConfig/histoire.presse.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.histoire.presse.fr%2Factualite%2Finfos%2Fgeorges-clemenceau-chambre-30-juillet-1885-22-11-2010-15540 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' nd-no-sidebars ')]//div[contains(concat(' ',normalize-space(@class),' '),' nd-region-middle ')] +test_url: http://www.histoire.presse.fr/actualite/infos/georges-clemenceau-chambre-30-juillet-1885-22-11-2010-15540 diff --git a/data/GrabberConfig/historytoday.com.txt b/data/GrabberConfig/historytoday.com.txt new file mode 100644 index 00000000..78fb60a6 --- /dev/null +++ b/data/GrabberConfig/historytoday.com.txt @@ -0,0 +1,10 @@ +body://div[@id = 'content'] +author://span[@class = 'authors'] +author://span[@class = 'ht-vtag'][1] +date:substring-before(//meta[@name = 'dc.date']/@content,'T') +strip://div[contains(@class, 'region-ubercontent')] +strip://h1 +strip://div[@id = 'ht-author'] +strip://ul[@class = 'links inline'] +strip://div[@id = 'ht-tools'] +test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur
\ No newline at end of file diff --git a/data/GrabberConfig/hmercer.com.txt b/data/GrabberConfig/hmercer.com.txt new file mode 100644 index 00000000..2da13a8e --- /dev/null +++ b/data/GrabberConfig/hmercer.com.txt @@ -0,0 +1,5 @@ +title: //*[@class='ptitle'] +date: //span[@class='date'] +body: //div[@class='body'] +prune: no +test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/
\ No newline at end of file diff --git a/data/GrabberConfig/hollywoodlife.com.txt b/data/GrabberConfig/hollywoodlife.com.txt new file mode 100644 index 00000000..975ffa26 --- /dev/null +++ b/data/GrabberConfig/hollywoodlife.com.txt @@ -0,0 +1,22 @@ +date: //meta[@name='sailthru.date']/@content +body: //article[contains(@class, 'entry-content')] + +strip_image_src: subscribe.png + +strip_id_or_class: wpcom-iframe-form +strip_id_or_class: gallery-thumbs +strip_id_or_class: twitter +strip_id_or_class: fb-link +strip_id_or_class: pinterest + +strip: //div[@class='data'] +strip: //iframe[contains(@name, 'wpcom')] + +find_string: <a href="http://www.youtube.com/subscription_center?add_user_id=2rJLq19N0dGrxfib80M +replace_string: </p></div></body></html><!-- + +find_string: <h3>More +replace_string: </div></body></html><!-- + +test_url: http://hollywoodlife.com/2013/10/04/miriam-carey-dead-capitol-hill-car-chase-shooting-postpartum-depression/ +test_url: http://hollywoodlife.com/feed/
\ No newline at end of file diff --git a/data/GrabberConfig/hometheaterreview.com.txt b/data/GrabberConfig/hometheaterreview.com.txt new file mode 100644 index 00000000..8ed26ff5 --- /dev/null +++ b/data/GrabberConfig/hometheaterreview.com.txt @@ -0,0 +1,4 @@ +body: //div[@id='entry-body'] +strip_id_or_class: paginate +strip: //p[contains(., 'Additional Resources')] +test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/
\ No newline at end of file diff --git a/data/GrabberConfig/hosted.ap.org.txt b/data/GrabberConfig/hosted.ap.org.txt new file mode 100644 index 00000000..2a285f89 --- /dev/null +++ b/data/GrabberConfig/hosted.ap.org.txt @@ -0,0 +1,5 @@ +body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content'] +tidy: no +strip_image_src: analytics.apnewsregistry + +test_url: http://hosted.ap.org/dynamic/stories/L/LT_MEXICO_MIGRANT_CARAVAN?SITE=PALEH&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2018-04-29-14-34-45 diff --git a/data/GrabberConfig/hosted2.ap.org.txt b/data/GrabberConfig/hosted2.ap.org.txt new file mode 100644 index 00000000..0ca94d34 --- /dev/null +++ b/data/GrabberConfig/hosted2.ap.org.txt @@ -0,0 +1,7 @@ +body: //div[contains(@id, 'photosDiv-')] | //div[contains(@id, 'storyBodyDiv')]//p[contains(@class, 'entry-content')] +tidy: no +prune: no + +test_url: http://hosted2.ap.org/KSSUC/*/Article_2015-09-24-FBN-Skyboard-Craze/id-589cf87959414856b5cd5e1ad869c68e + +test_url: http://hosted2.ap.org/APDefault/*/Article_2015-09-28-US--United%20States-Russia/id-79126486540a4f2ca0b48133013a8b03 diff --git a/data/GrabberConfig/houstonchronicle.com.txt b/data/GrabberConfig/houstonchronicle.com.txt new file mode 100644 index 00000000..3d65c985 --- /dev/null +++ b/data/GrabberConfig/houstonchronicle.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='subsection_wrap'] +next_page_link: //ul[@class='pagination']//a[contains(text(), '»')] + +test_url: http://www.houstonchronicle.com/nasa/adrift/1/ diff --git a/data/GrabberConfig/howtogeek.com.txt b/data/GrabberConfig/howtogeek.com.txt new file mode 100644 index 00000000..baa2ed4a --- /dev/null +++ b/data/GrabberConfig/howtogeek.com.txt @@ -0,0 +1,11 @@ +body: //div[contains(@class, 'thecontent')] + +strip_image_src: loading.gif +find_string:src="http://cdn.howtogeek.com/public/images/blank.gif" +replace_string:- +find_string:data-href= +replace_string:src= + +strip_id_or_class: relatedside + +test_url: 
http://www.howtogeek.com/school/microsoft-excel-formulas-and-functions/lesson1/
\ No newline at end of file diff --git a/data/GrabberConfig/hs.fi.txt b/data/GrabberConfig/hs.fi.txt new file mode 100644 index 00000000..360dc725 --- /dev/null +++ b/data/GrabberConfig/hs.fi.txt @@ -0,0 +1,3 @@ +prune: yes +tidy: yes +test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405
\ No newline at end of file diff --git a/data/GrabberConfig/ht.ly.txt b/data/GrabberConfig/ht.ly.txt new file mode 100644 index 00000000..46535088 --- /dev/null +++ b/data/GrabberConfig/ht.ly.txt @@ -0,0 +1,3 @@ +single_page_link: //iframe[@id='hootFrame']/@src + +test_url: http://ht.ly/bOiZV
\ No newline at end of file diff --git a/data/GrabberConfig/huffingtonpost.co.uk.txt b/data/GrabberConfig/huffingtonpost.co.uk.txt new file mode 100644 index 00000000..1c818c2a --- /dev/null +++ b/data/GrabberConfig/huffingtonpost.co.uk.txt @@ -0,0 +1,5 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry__body ')] +strip_id_or_class: tag-cloud +strip_id_or_class: below-entry + +test_url: http://www.huffingtonpost.co.uk/entry/edward-snowdens-david-cameron-tweet-tells-public-to-rise-up-if-they-want-him-to-resign_uk_57074b52e4b00c769e2d91a9?s481714i diff --git a/data/GrabberConfig/huffingtonpost.com.txt b/data/GrabberConfig/huffingtonpost.com.txt new file mode 100644 index 00000000..69b6b10c --- /dev/null +++ b/data/GrabberConfig/huffingtonpost.com.txt @@ -0,0 +1,21 @@ +title: //meta[@property="og:title"]/@content +body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')] +date: //meta[@name="publish_date"]/@content +author: //a[@rel="author"] +author: //meta[@name="author"]/@content + +prune: no +tidy: no + +strip: //footer +strip_id_or_class: ps-slideshow +strip_id_or_class: fs-slideshow +strip: //p[contains(., 'Related on HuffPost:')] +strip_id_or_class: contribute-story +strip_id_or_class: promo_holder + +# end early +replace_string(<div class="sbm-main): </body></html><div class="not-interested + +test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html +test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html diff --git a/data/GrabberConfig/huffingtonpost.fr.txt b/data/GrabberConfig/huffingtonpost.fr.txt new file mode 100644 index 00000000..d6f6b941 --- /dev/null +++ b/data/GrabberConfig/huffingtonpost.fr.txt @@ -0,0 +1,5 @@ +strip_id_or_class: tag-cloud +strip_id_or_class: follow-us__correction +strip_id_or_class: corrections_container + +test_url: 
http://www.huffingtonpost.fr/michael-dias/pourquoi-la-generation-y-est-elle-en-train-de-demissionner/ diff --git a/data/GrabberConfig/humanite.fr.txt b/data/GrabberConfig/humanite.fr.txt new file mode 100644 index 00000000..24222cf9 --- /dev/null +++ b/data/GrabberConfig/humanite.fr.txt @@ -0,0 +1,21 @@ +title: //meta[@property='og:title']/@content + +author: //div[@id='content']//div[contains(concat(' ',normalize-space(@class),' '),' field-name-field-news-auteur ')]//a + +body: //div[@id='content']//div[contains(concat(' ',normalize-space(@class),' '),' field-name-field-news-chapo ')] | //div[@id='content']//div[contains(concat(' ',normalize-space(@class),' '),' field-name-field-news-text ')] | //div[@id='content']//div[contains(concat(' ',normalize-space(@class),' '),' field-name-field-news-encadre ')] + +strip_id_or_class: field-name-block-similar-contents +strip_id_or_class: field-name-field-news-auteur-nom-trias + +test_url: https://www.humanite.fr/avignon-korsunovas-tire-bout-portant-sur-tous-les-tartuffe-658245 + +# Wallabag-specific login directives (not supported in FTR): +requires_login: yes +login_uri: https://www.humanite.fr/user/login +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' group-paywall ')] +login_username_field: name +login_password_field: pass +login_extra_fields: form_build_id=@=xpath('//form[@id="user-login"]//input[@name="form_build_id"]', request_html('https://www.humanite.fr/user/login')) +login_extra_fields: form_id=user_login +login_extra_fields: op=Se connecter +test_url: https://www.humanite.fr/musique-arpenter-la-diversite-humaine-au-festival-jazz-des-cinq-continents-658280 diff --git a/data/GrabberConfig/humantransit.org.txt b/data/GrabberConfig/humantransit.org.txt new file mode 100644 index 00000000..92d3c678 --- /dev/null +++ b/data/GrabberConfig/humantransit.org.txt @@ -0,0 +1,5 @@ +title: //h3[@class="entry-header"] +date: //h2[@class="date-header"] +body: //div[contains(@class, 'entry')] + 
+test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html
\ No newline at end of file diff --git a/data/GrabberConfig/hurriyet.com.tr.txt b/data/GrabberConfig/hurriyet.com.tr.txt new file mode 100644 index 00000000..68fd220a --- /dev/null +++ b/data/GrabberConfig/hurriyet.com.tr.txt @@ -0,0 +1,7 @@ +title: //div[@class='HaberDetayTitleHold Title']/h1 +body: //div[@id='YazarDetayText'] +author: //div[@class='HaberDetayTitleHold Title']/h1 +prune: no + +test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp +test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp
\ No newline at end of file diff --git a/data/GrabberConfig/hvg.hu.txt b/data/GrabberConfig/hvg.hu.txt new file mode 100644 index 00000000..05e7b5f1 --- /dev/null +++ b/data/GrabberConfig/hvg.hu.txt @@ -0,0 +1,9 @@ +title: //div[@id='pg-content']//h1 +body: //div[@id='articleBody0'] +replace_string(</table>): </table><br /><br /> + +single_page_link: //div[@class="up-header"]/a + +prune: no + +test_url: http://hvg.hu/w/20111125_sparta
\ No newline at end of file diff --git a/data/GrabberConfig/hypebeast.com.txt b/data/GrabberConfig/hypebeast.com.txt new file mode 100644 index 00000000..23e47545 --- /dev/null +++ b/data/GrabberConfig/hypebeast.com.txt @@ -0,0 +1,10 @@ +body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1] +author: //span[@class='author']/a + +strip_id_or_class: disqus +strip_id_or_class: paginator +strip_id_or_class: photo-number + +prune: no + +test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/
\ No newline at end of file diff --git a/data/GrabberConfig/iansommerville.com.txt b/data/GrabberConfig/iansommerville.com.txt new file mode 100644 index 00000000..82a4055e --- /dev/null +++ b/data/GrabberConfig/iansommerville.com.txt @@ -0,0 +1,3 @@ +http_header(user-agent): PHP/5.3 + +test_url: http://iansommerville.com/blog/a-guide-to-scottish-delicacies-for-tgo-challengers/ diff --git a/data/GrabberConfig/icannabis.tumblr.com.txt b/data/GrabberConfig/icannabis.tumblr.com.txt new file mode 100644 index 00000000..3bda753c --- /dev/null +++ b/data/GrabberConfig/icannabis.tumblr.com.txt @@ -0,0 +1,9 @@ +tidy:no +prune:no + +body://div[contains(@id,'content')] + +strip_id_or_class:meta +strip_id_or_class:notes +strip_id_or_class:pagination +test_url: http://icannabis.tumblr.com/post/28660592471/reviewmswireless3000
\ No newline at end of file diff --git a/data/GrabberConfig/ici.radio-canada.ca.txt b/data/GrabberConfig/ici.radio-canada.ca.txt new file mode 100644 index 00000000..37d3c07a --- /dev/null +++ b/data/GrabberConfig/ici.radio-canada.ca.txt @@ -0,0 +1,12 @@ +body: //article +body: //p[@class='TexteChronique'] +body: //div[@class='src-content'] + +strip: //header +strip: //figure +strip: //div[@class='framed'] +strip: //form + +test_url: http://ici.radio-canada.ca/nouvelle/1003322/lexique-mots-neige-hiver-guy-bertrand +test_url: http://ici.radio-canada.ca/tele/deuxieme-chance/inscription/ +test_url: http://ici.radio-canada.ca/emissions/aujourd_hui_l_histoire/2016-2017/chronique.asp?idChronique=423294 diff --git a/data/GrabberConfig/idealog.co.nz.txt b/data/GrabberConfig/idealog.co.nz.txt new file mode 100644 index 00000000..ca88f606 --- /dev/null +++ b/data/GrabberConfig/idealog.co.nz.txt @@ -0,0 +1,12 @@ +body: //div[@class='content'] + +strip: //p[@class='dateline'] +strip: //hr +strip_id_or_class: share +strip_id_or_class: comments +strip_id_or_class: tags + +title: substring-before(//title,' ::') +author: substring-before(//p[@class='dateline'],',') +date: //p[@class='dateline']/time +test_url: http://www.idealog.co.nz/blog/2012/12/geeks-plane-help-kiwis-take-san-francisco
\ No newline at end of file diff --git a/data/GrabberConfig/idlewords.com.txt b/data/GrabberConfig/idlewords.com.txt new file mode 100644 index 00000000..f3b33796 --- /dev/null +++ b/data/GrabberConfig/idlewords.com.txt @@ -0,0 +1,7 @@ +title: //a[@class='post_title'] +body: //div[@class='entrybox'] +strip_id_or_class: post_title +date: //div[@class='entrybox']/b[1] +strip: //div[@class='entrybox']/b[1] +author: string('Maciej Cegłowski') +test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm
\ No newline at end of file diff --git a/data/GrabberConfig/igen.fr.txt b/data/GrabberConfig/igen.fr.txt new file mode 100644 index 00000000..b6a2cffc --- /dev/null +++ b/data/GrabberConfig/igen.fr.txt @@ -0,0 +1,8 @@ +title: //section[@class='titre']/h1 +author: //span[@class="infos"]/span[@class="username"] +date: //span[@class="infos"]/time/@datetime +body: //section[@class="corps"] + +strip: //select[@id="news_changing_node"] + +test_url: http://www.igen.fr/itunes/2016/05/itunes-mis-jour-la-semaine-prochaine-contre-le-bug-de-suppression-de-musique-95803 diff --git a/data/GrabberConfig/igeneration.fr.txt b/data/GrabberConfig/igeneration.fr.txt new file mode 100644 index 00000000..45dd5f25 --- /dev/null +++ b/data/GrabberConfig/igeneration.fr.txt @@ -0,0 +1,5 @@ +author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ') +date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- '))) +body: //div[@class='content clear-block zoneApple'] + +test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112
\ No newline at end of file diff --git a/data/GrabberConfig/ilounge.com.txt b/data/GrabberConfig/ilounge.com.txt new file mode 100644 index 00000000..9880b51f --- /dev/null +++ b/data/GrabberConfig/ilounge.com.txt @@ -0,0 +1,13 @@ +# Get proper Title, Author and Date info +title: substring-before(//title, '|') +author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By') +date: //span[@class='instapaper_date'] + +# For Reviews & First Looks, get the intro paragraph and put it in front of the main body. +move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body'] +body: //div[@id='instapaper_para1'] +strip: //div[@class='reviewinfo'] + +# We don't use footnotes, so why bother checking for them? +footnotes: no +test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter
\ No newline at end of file diff --git a/data/GrabberConfig/ilyabirman.ru.txt b/data/GrabberConfig/ilyabirman.ru.txt new file mode 100644 index 00000000..51a7eb9c --- /dev/null +++ b/data/GrabberConfig/ilyabirman.ru.txt @@ -0,0 +1,5 @@ +title: //div[@class='published visible e2-smart-title']//span +author: //span[@id='e2-blog-title'] +date: //p[@class='super-h'] +body: //div[@class='text published visible'] +test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/
\ No newline at end of file diff --git a/data/GrabberConfig/in.rbth.com.txt b/data/GrabberConfig/in.rbth.com.txt new file mode 100644 index 00000000..1c0fff30 --- /dev/null +++ b/data/GrabberConfig/in.rbth.com.txt @@ -0,0 +1,6 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' title ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' lead ')] | //section[contains(concat(' ',normalize-space(@class),' '),' text ')] +strip: //div[@class='incut i_right'] | //div[contains(concat(' ',normalize-space(@class),' '),' mastertag_pic ')] +date: //span[contains(concat(' ',normalize-space(@class),' '),' date ')] +author: //span[contains(concat(' ',normalize-space(@class),' '),' author ')] +test_url: https://in.rbth.com/politics/2016/04/26/is-revolution-in-russia-by-emigres-abroad-possible_588071 diff --git a/data/GrabberConfig/inc.com.txt b/data/GrabberConfig/inc.com.txt new file mode 100644 index 00000000..5410e64e --- /dev/null +++ b/data/GrabberConfig/inc.com.txt @@ -0,0 +1,21 @@ +author: substring-after(substring-before(//div[@id='byline'],'|'),'By') +author: //div[@class='byline']/a +date: //span[@class='pubdate'] +# print friendly page +body: //div[@id='text'] +# regular page +body: //div[@id= 'articlecontent'] + +strip: //div[@id= 'articlecontent']/h1 +strip: //div[@id='articlecontent']/p[@class='deck'] +strip: //div[@id='articlecontent']/div[@class='byline'] +strip: //div[@id='articlespacer'] +strip: //div[@id='incsharebox'] +strip: //div[@id='articlesidebar'] + +prune: no + +single_page_link: //a[contains(@href, 'Printer_Friendly.html')] +strip: //a[contains(., 'Dig Deeper')] +test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html +test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html
\ No newline at end of file diff --git a/data/GrabberConfig/indehekken.net.txt b/data/GrabberConfig/indehekken.net.txt new file mode 100644 index 00000000..93a9f219 --- /dev/null +++ b/data/GrabberConfig/indehekken.net.txt @@ -0,0 +1,3 @@ +body: //div[@class='post-entry']/p + +test_url: http://www.indehekken.net/you-only-sing-when-youre-rowing/ diff --git a/data/GrabberConfig/independent.co.uk.txt b/data/GrabberConfig/independent.co.uk.txt new file mode 100644 index 00000000..d142d81d --- /dev/null +++ b/data/GrabberConfig/independent.co.uk.txt @@ -0,0 +1,18 @@ +title: //meta[@property='og:title']/@content +body: //div[@itemprop="articleBody"] +body: //img[contains(@class, 'FirstImage')] | //div[contains(@class, 'articleContent')] +date: //meta[@property='article:published_time']/@content +author: //div[@id='main']//div[@class='byline']//span[@class='authorName'] + +strip_id_or_class: RelatedArtTag +strip_id_or_class: syndication-btn + +strip: //h5[contains(., 'READ MORE:')] +strip: //h5[contains(., 'Read more:')] + +tidy: no +test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html +test_url: http://www.independent.co.uk/voices/comment/robert-fisk-on-the-cia-torture-report-once-again-language-is-distorted-in-order-to-hide-us-state-wrongdoing-9924501.html +test_contains: Thank God for Noam Chomsky. + +test_url: http://www.independent.co.uk/news/uk/rss diff --git a/data/GrabberConfig/indiatimes.com.txt b/data/GrabberConfig/indiatimes.com.txt new file mode 100644 index 00000000..8112105f --- /dev/null +++ b/data/GrabberConfig/indiatimes.com.txt @@ -0,0 +1,6 @@ +body: //figure[@class='mainVideo'] +strip: //figcaption + +prune: no + +test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html
\ No newline at end of file diff --git a/data/GrabberConfig/indiehackers.com.txt b/data/GrabberConfig/indiehackers.com.txt new file mode 100644 index 00000000..3fd970b1 --- /dev/null +++ b/data/GrabberConfig/indiehackers.com.txt @@ -0,0 +1,6 @@ +prune: no +body: //div[@itemprop="articleBody"] +strip: //aside +strip_id_or_class: partner-spot + +test_url: https://www.indiehackers.com/businesses/paleo-meal-plans diff --git a/data/GrabberConfig/inessential.com.txt b/data/GrabberConfig/inessential.com.txt new file mode 100644 index 00000000..52252455 --- /dev/null +++ b/data/GrabberConfig/inessential.com.txt @@ -0,0 +1,5 @@ +title: //div[@class='weblogPost']/h3[1] +author: ("Brent Simmons") +date: //span[@class="weblogPostDisplayDate"] +body: //div[@class='weblogPostBody'] +test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo
\ No newline at end of file diff --git a/data/GrabberConfig/infoq.com.txt b/data/GrabberConfig/infoq.com.txt new file mode 100644 index 00000000..f4a328a6 --- /dev/null +++ b/data/GrabberConfig/infoq.com.txt @@ -0,0 +1,14 @@ +body: //div[@id="intTranscript"] +body: //div[@class="box-content"] +title: //div[@class="box-content"]//h1[1] +author: //p[@class="info"]/strong +date: substring-before(substring-after(//p[@class="info"], "on"), "Length") +strip: //div[@class="box-content"]//h1[1] +strip: //div[@class="box-content"]//p[@class="info"] +strip_id_or_class: vendor-content-box +strip_id_or_class: tags2 +strip_id_or_class: instructions +strip_id_or_class: comments +strip_id_or_class: forum-list-tree +strip: //div[@class="addthis_toolbox addthis_default_style"] +test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview
\ No newline at end of file diff --git a/data/GrabberConfig/informador.com.mx.txt b/data/GrabberConfig/informador.com.mx.txt new file mode 100644 index 00000000..77987493 --- /dev/null +++ b/data/GrabberConfig/informador.com.mx.txt @@ -0,0 +1,9 @@ +title: //div[@class='tituloInt'] +body: //div[@class='notaPortada'] +strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota'] +date: //span[@class='publi'] +author: //span[@class='autor'] +tidy: no +prune: no + +test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm
\ No newline at end of file diff --git a/data/GrabberConfig/information.dk.txt b/data/GrabberConfig/information.dk.txt new file mode 100644 index 00000000..3ade754d --- /dev/null +++ b/data/GrabberConfig/information.dk.txt @@ -0,0 +1,7 @@ +title: //meta[@property='og:title']/@content +author: //*[@property='dc:creator'] +date: //*[@property='dc:date']/@content +body: //div[@id='page-content']//div[contains(@class, 'article-body')] + +tidy: no +test_url: http://www.information.dk/282307
\ No newline at end of file diff --git a/data/GrabberConfig/informationarchitects.net.txt b/data/GrabberConfig/informationarchitects.net.txt new file mode 100644 index 00000000..1330a040 --- /dev/null +++ b/data/GrabberConfig/informationarchitects.net.txt @@ -0,0 +1,10 @@ +title://h1[@class="post_title"] +body://article[@class="post"] +date://h1[@class="section_separator"] +author://span[@class="post_author"] +strip://nav[@class="arrow_nav"] +strip://section[@id="contact"] +strip_id_or_class:post_title +strip_id_or_class:post_author +strip_id_or_class:section_separator +test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/
\ No newline at end of file diff --git a/data/GrabberConfig/informationclearinghouse.info.txt b/data/GrabberConfig/informationclearinghouse.info.txt new file mode 100644 index 00000000..60b798e6 --- /dev/null +++ b/data/GrabberConfig/informationclearinghouse.info.txt @@ -0,0 +1,6 @@ +title: //head/title +body: //table[@id='table3']//div[@class='postContent'] +prune: no +tidy: no + +test_url: http://www.informationclearinghouse.info/article28238.htm
\ No newline at end of file diff --git a/data/GrabberConfig/informit.com.txt b/data/GrabberConfig/informit.com.txt new file mode 100644 index 00000000..567bc6db --- /dev/null +++ b/data/GrabberConfig/informit.com.txt @@ -0,0 +1,7 @@ +title: //div[@id='content']/h1 +body: //div[@id="content"] +strip: //img[contains(@src, 'informit_printer.png')] +single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly/')] +prune: no + +test_url: http://www.informit.com/articles/article.aspx?p=1729268 diff --git a/data/GrabberConfig/infoworld.com.txt b/data/GrabberConfig/infoworld.com.txt new file mode 100644 index 00000000..766dfbab --- /dev/null +++ b/data/GrabberConfig/infoworld.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.infoworld.com/article/3053563/it-management/disrupt-or-die-beware-the-siren-calls-of-tech-consultants.html diff --git a/data/GrabberConfig/infzm.com.txt b/data/GrabberConfig/infzm.com.txt new file mode 100644 index 00000000..489d5aff --- /dev/null +++ b/data/GrabberConfig/infzm.com.txt @@ -0,0 +1,9 @@ +# This filter is tested on: +# http://www.infzm.com/content/71068 +# http://www.infzm.com/content/41577 + +author://em[contains(@class, 'toAuthor')] +date:substring(//em[contains(@class, 'pubTime')],1) +body://section[contains(@id, 'articleContent')] +title://h1[contains(@class ,'articleHeadline clearfix')] +test_url: http://www.infzm.com/content/41577
\ No newline at end of file diff --git a/data/GrabberConfig/inhabitat.com.txt b/data/GrabberConfig/inhabitat.com.txt new file mode 100644 index 00000000..c63f53a6 --- /dev/null +++ b/data/GrabberConfig/inhabitat.com.txt @@ -0,0 +1,8 @@ +# set body +body: //div[@class='post-listing'] + +# remove clutter +strip: //a/big +strip: //a/em +strip: //p/em +test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/
\ No newline at end of file diff --git a/data/GrabberConfig/instagr.am.txt b/data/GrabberConfig/instagr.am.txt new file mode 100644 index 00000000..522caebc --- /dev/null +++ b/data/GrabberConfig/instagr.am.txt @@ -0,0 +1,6 @@ +title: //div[@class='caption'] +author: //p[@class='username'] + +strip: //div[@class='contents']/h3 +strip: //div[@class='location'] +test_url: http://instagr.am/p/G-s_aciyDJ/
\ No newline at end of file diff --git a/data/GrabberConfig/intelligenceonline.fr.txt b/data/GrabberConfig/intelligenceonline.fr.txt new file mode 100644 index 00000000..8e3b2c0e --- /dev/null +++ b/data/GrabberConfig/intelligenceonline.fr.txt @@ -0,0 +1,26 @@ + +# Any modifications done here should be duplicated in +# - lalettrea.fr.txt +# - africaintelligence.fr.txt +# as they seem to use the exact same CMS software as intelligenceonline.fr + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-chapo ')] | //div[contains(concat(' ',normalize-space(@class),' '),' article-body ')] + +prune: no + +strip_id_or_class: sidenav +strip_id_or_class: sidenav-content +strip_id_or_class: article-copyright + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //form[contains(concat(' ',normalize-space(@class),' '),' form-login ')] +login_uri: https://www.intelligenceonline.fr/ajax/login/login +login_username_field: Identifiant +login_password_field: MotDePasse +login_extra_fields: ConnexionAuto=on +login_extra_fields: OrigineLogin=Landing + +test_url: https://www.intelligenceonline.fr/renseignement-d-etat/2017/05/24/la-gendarmerie-se-met-au-cyber-contre-telegram,108235581-gra diff --git a/data/GrabberConfig/interviewmagazine.com.txt b/data/GrabberConfig/interviewmagazine.com.txt new file mode 100644 index 00000000..a9d4f8ca --- /dev/null +++ b/data/GrabberConfig/interviewmagazine.com.txt @@ -0,0 +1,4 @@ +title: //title +body: //div[contains(@class, 'block')] + +test_url: http://www.interviewmagazine.com/film/spike-jonze
\ No newline at end of file diff --git a/data/GrabberConfig/ipadclub.nl.txt b/data/GrabberConfig/ipadclub.nl.txt new file mode 100644 index 00000000..afe058df --- /dev/null +++ b/data/GrabberConfig/ipadclub.nl.txt @@ -0,0 +1,7 @@ +body: //div[@id = 'post'] +strip: //div[@class = 'postinfo'] +strip: //div[@id = 'postmetanew'] +strip: //div[@class = 'paginator'] +strip: //div[@class = 'col-2'] +strip: //div[@id = 'adfactor-label'] +test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/
\ No newline at end of file diff --git a/data/GrabberConfig/ipadplanet.nl.txt b/data/GrabberConfig/ipadplanet.nl.txt new file mode 100644 index 00000000..dedb5572 --- /dev/null +++ b/data/GrabberConfig/ipadplanet.nl.txt @@ -0,0 +1,7 @@ +body: //div[@id = 'post'] +strip: //div[@class = 'postinfo'] +strip: //div[@id = 'postmetanew'] +strip: //div[@class = 'paginator'] +strip: //div[@class = 'col-2'] +strip: //div[@id = 'adfactor-label'] +test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/
\ No newline at end of file diff --git a/data/GrabberConfig/iphon.fr.txt b/data/GrabberConfig/iphon.fr.txt new file mode 100644 index 00000000..1683ffb8 --- /dev/null +++ b/data/GrabberConfig/iphon.fr.txt @@ -0,0 +1,5 @@ +date: //meta[@name="date"]/@content +author: //meta[@name="author"]/@content +strip_id_or_class: follow + +test_url: http://www.iphon.fr/post/iphone-8-x-recharge-sans-fil-cable-simultanee-891682 diff --git a/data/GrabberConfig/iphoneaddict.fr.txt b/data/GrabberConfig/iphoneaddict.fr.txt new file mode 100644 index 00000000..b0620900 --- /dev/null +++ b/data/GrabberConfig/iphoneaddict.fr.txt @@ -0,0 +1,6 @@ +title: //div[@class='entry-content item']/h1[@class='title fn entry-title'] +author: //div[@class="entry-meta"]/span[@class='reviewer'] +date: //div[@class="entry-meta"]/span[@class='dtreviewed'] +body: //div[@class="post-content description"] + +test_url: http://iphoneaddict.fr/post/news-179987-insolite-b-o-game-of-thrones-interpretee-nouveaux-instruments-chinois-garageband diff --git a/data/GrabberConfig/iphoneclub.nl.txt b/data/GrabberConfig/iphoneclub.nl.txt new file mode 100644 index 00000000..850a24e9 --- /dev/null +++ b/data/GrabberConfig/iphoneclub.nl.txt @@ -0,0 +1,7 @@ +body: //div[@id = 'post'] +strip: //div[@class = 'postinfo'] +strip: //div[@id = 'postmetanew'] +strip: //div[@class = 'paginator'] +strip: //div[@class = 'col-2'] +strip: //div[@id = 'adfactor-label'] +test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/
\ No newline at end of file diff --git a/data/GrabberConfig/iphonehacks.com.txt b/data/GrabberConfig/iphonehacks.com.txt new file mode 100644 index 00000000..e8ccea06 --- /dev/null +++ b/data/GrabberConfig/iphonehacks.com.txt @@ -0,0 +1,9 @@ +title: //meta[@name='og:title']/@content +body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')] + +strip: //span[@vanilla-identifier] + +prune: no +tidy: no + +test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html
\ No newline at end of file diff --git a/data/GrabberConfig/iphonetweak.fr.txt b/data/GrabberConfig/iphonetweak.fr.txt new file mode 100644 index 00000000..9210e394 --- /dev/null +++ b/data/GrabberConfig/iphonetweak.fr.txt @@ -0,0 +1,4 @@ +title: //div[@class='fond_titre']/h1[@class='post-title'] +body: //div[@class="post-chapo"] + +test_url: http://iphonetweak.fr/2016/05/20/apple-watch-deja-jailbreakee diff --git a/data/GrabberConfig/iplaysoft.com.txt b/data/GrabberConfig/iplaysoft.com.txt new file mode 100644 index 00000000..4a944768 --- /dev/null +++ b/data/GrabberConfig/iplaysoft.com.txt @@ -0,0 +1,2 @@ +body: //div[@id='content']//div[@class='entry-banner' or @class='entry-content'] +test_url: http://www.iplaysoft.com/webbrowserpassview.html
\ No newline at end of file diff --git a/data/GrabberConfig/iso.500px.com.txt b/data/GrabberConfig/iso.500px.com.txt new file mode 100644 index 00000000..ffa29a1d --- /dev/null +++ b/data/GrabberConfig/iso.500px.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fiso.500px.com%2Fhow-i-post-processed-the-most-successful-image-of-my-career%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +test_url: https://iso.500px.com/how-i-post-processed-the-most-successful-image-of-my-career/ diff --git a/data/GrabberConfig/isource.com.txt b/data/GrabberConfig/isource.com.txt new file mode 100644 index 00000000..215fdf87 --- /dev/null +++ b/data/GrabberConfig/isource.com.txt @@ -0,0 +1,6 @@ +# Remove social buttons +strip: //div[@id='temp_Content_Right'] + +# Remove duplicate article title +strip: //*[(@class='storytitle')] +test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/
\ No newline at end of file diff --git a/data/GrabberConfig/it-connect.fr.txt b/data/GrabberConfig/it-connect.fr.txt new file mode 100644 index 00000000..80c95e68 --- /dev/null +++ b/data/GrabberConfig/it-connect.fr.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.it-connect.fr%2Fdebuter-avec-docker-et-les-containers-sous-debian-8%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post_inner_wrapper ')] + +strip_id_or_class: toc_container + +test_url: http://www.it-connect.fr/debuter-avec-docker-et-les-containers-sous-debian-8/ diff --git a/data/GrabberConfig/itavisen.no.txt b/data/GrabberConfig/itavisen.no.txt new file mode 100644 index 00000000..0fe337ac --- /dev/null +++ b/data/GrabberConfig/itavisen.no.txt @@ -0,0 +1,4 @@ +author: //p[@class = 'byline']//span[@class='author'] + +body: //div[@id = 'story-contents'] +test_url: http://itavisen.no/2015/09/08/norske-nettleverandorer-vil-ikke-blokkere-pirate-bay/ diff --git a/data/GrabberConfig/itmedia.co.jp.txt b/data/GrabberConfig/itmedia.co.jp.txt new file mode 100644 index 00000000..97f00ce8 --- /dev/null +++ b/data/GrabberConfig/itmedia.co.jp.txt @@ -0,0 +1,8 @@ +body: //div[@id='cmsBody'] + +next_page_link: //span[@id='next']/a + +strip_id_or_class: cmsCopyright +strip_id_or_class: masterSocialbuttonBtm + +test_url: http://www.itmedia.co.jp/enterprise/articles/0912/05/news002.html
\ No newline at end of file diff --git a/data/GrabberConfig/itnews.com.au.txt b/data/GrabberConfig/itnews.com.au.txt new file mode 100644 index 00000000..f1ff4c38 --- /dev/null +++ b/data/GrabberConfig/itnews.com.au.txt @@ -0,0 +1,7 @@ +title: //h1[@class='article-header'] +body: //div[@class='body-content'] +author: //span[@class='author-byline']/a[contains(@id, 'Author')] + +strip: //span[contains(@id, 'Article_SourceLabel')] + +test_url: http://www.itnews.com.au/feature/the-untold-story-of-iinet-and-internodes-basslink-internet-woes-417296 diff --git a/data/GrabberConfig/itsfoss.com.txt b/data/GrabberConfig/itsfoss.com.txt new file mode 100644 index 00000000..c698678e --- /dev/null +++ b/data/GrabberConfig/itsfoss.com.txt @@ -0,0 +1,6 @@ +title: //h1 +body: //article[contains(@class,'type-post')]/div[@itemprop='text'] +strip: //a[@rel='dofollow'] +strip: //div[contains(@class,'zem_rp_wrap')] +strip: //div[@id='shr_canvas2'] +test_url: https://itsfoss.com/fix-gvfsd-smb-high-cpu-ubuntu/ diff --git a/data/GrabberConfig/itstactical.com.txt b/data/GrabberConfig/itstactical.com.txt new file mode 100644 index 00000000..b8cb461c --- /dev/null +++ b/data/GrabberConfig/itstactical.com.txt @@ -0,0 +1,12 @@ +title: //h1[@class="entry-title"] +body: //div[@class='format_text entry-content'] +author: //span[@class="author vcard"]/a +date: //abbr[@class="published"] + +strip_id_or_class: related-posts +strip_id_or_class: membershipbox +strip_id_or_class: share_this_compact_bt + + +footnotes: no +test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/
\ No newline at end of file diff --git a/data/GrabberConfig/itunes.apple.com.txt b/data/GrabberConfig/itunes.apple.com.txt new file mode 100644 index 00000000..ffd95561 --- /dev/null +++ b/data/GrabberConfig/itunes.apple.com.txt @@ -0,0 +1,14 @@ +body: //div[@id='left-stack' or contains(@class, 'center-stack')] + +find_string: class="artwork" src=" +replace_string: class="artwork" src-disabled=" +find_string: src-swap-high-dpi=" +replace_string: src=" + +strip_id_or_class: rating +strip_id_or_class: listeners-also-bought + +prune: no + +test_url: https://itunes.apple.com/us/rss/topaudiobooks/limit=10/xml +test_url: https://itunes.apple.com/us/audiobook/the-giver-unabridged/id356345850
\ No newline at end of file diff --git a/data/GrabberConfig/itwire.com.txt b/data/GrabberConfig/itwire.com.txt new file mode 100644 index 00000000..72b41065 --- /dev/null +++ b/data/GrabberConfig/itwire.com.txt @@ -0,0 +1,5 @@ +author: //a[@rel="author"] +date: //li[@class="itemDateCreated"] +strip: //div[contains(@class, 'legend-rounded')] + +test_url: http://www.itwire.com/it-industry-news/market/59661-ibm-looks-to-high-value-solutions-to-meet-changing-demands diff --git a/data/GrabberConfig/itworld.com.txt b/data/GrabberConfig/itworld.com.txt new file mode 100644 index 00000000..b96ad338 --- /dev/null +++ b/data/GrabberConfig/itworld.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.itworld.com/article/3055304/your-face-is-big-data-the-title-of-this-photographers-experiment-says-it-all.html diff --git a/data/GrabberConfig/izismile.com.txt b/data/GrabberConfig/izismile.com.txt new file mode 100644 index 00000000..b0114d35 --- /dev/null +++ b/data/GrabberConfig/izismile.com.txt @@ -0,0 +1,4 @@ +body: //div[starts-with(@id, 'news-id-')] +prune: no + +test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html
\ No newline at end of file diff --git a/data/GrabberConfig/jalopnik.com.txt b/data/GrabberConfig/jalopnik.com.txt new file mode 100644 index 00000000..7823dbd7 --- /dev/null +++ b/data/GrabberConfig/jalopnik.com.txt @@ -0,0 +1,5 @@ +author: //span[@class='plus-icon'] + +http_header(user-agent): PHP/5.3 + +test_url: http://jalopnik.com/5892124/1955-porsche-550-spyder-sells-for-record-3685-million/
\ No newline at end of file diff --git a/data/GrabberConfig/jameslandrith.com.txt b/data/GrabberConfig/jameslandrith.com.txt new file mode 100644 index 00000000..9437771d --- /dev/null +++ b/data/GrabberConfig/jameslandrith.com.txt @@ -0,0 +1,18 @@ +title: //div[@class='blog']/ul[@class='single']/li//h1 +body: //div[@class='blog']/ul[@class='single']/li +strip: //h1 +strip: //*[@class='post-meta'] +strip: //div[@class='apss-social-share'] +strip: //div[@class='tags'] +strip: //*[@class='nav-single'] + +strip_id_or_class: post_meta +strip_id_or_class: social-share +strip: //nav + +# strip_comments:yes + +prune:no +tidy:yes + +test_url: http://jameslandrith.com/2016/03/20/so-turnabout-is-not-actual-advocacy-or-how-to-be-a-shitty-person-101/ diff --git a/data/GrabberConfig/jandan.net.txt b/data/GrabberConfig/jandan.net.txt new file mode 100644 index 00000000..343fd6fb --- /dev/null +++ b/data/GrabberConfig/jandan.net.txt @@ -0,0 +1,6 @@ +body: //div[@id='content']//div[@class = 'post f'] +strip_id_or_class: comment-big +strip_id_or_class: avatar +strip: //div[@class='time_s'] + +test_url: http://jandan.net/2011/04/03/iphone-5-sony.html
\ No newline at end of file diff --git a/data/GrabberConfig/japoninfos.com.txt b/data/GrabberConfig/japoninfos.com.txt new file mode 100644 index 00000000..31d17dad --- /dev/null +++ b/data/GrabberConfig/japoninfos.com.txt @@ -0,0 +1,30 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-content ')] + +title: //div[contains(concat(' ',normalize-space(@class),' '),' entry-title ')] +title: //h1[contains(concat(' ',normalize-space(@class),' '),' entry-title ')] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' td-author-name ')]//a +author: //div[contains(concat(' ',normalize-space(@class),' '),' author ')]//a + +strip_id_or_class: vc_message_box-icon +strip_id_or_class: mepr-unauthorized-message +strip_id_or_class: mepr-login-form-wrap + +test_url: https://www.japoninfos.com/ces-marques-que-lon-croit-japonaises-mais-qui-ne-le-sont-pas.html + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' mepr-unauthorized-message ')] +login_uri: https://www.japoninfos.com/login +login_username_field: log +login_password_field: pwd +login_extra_fields: mepr_is_login_page=true +login_extra_fields: mepr_process_login_form=true +login_extra_fields: redirect_to=/login/account +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Connexion + +test_url: https://www.japoninfos.com/trois-expositions-dans-le-cadre-du-festival-japonismes.html diff --git a/data/GrabberConfig/javaworld.com.txt b/data/GrabberConfig/javaworld.com.txt new file mode 100644 index 00000000..b329309d --- /dev/null +++ b/data/GrabberConfig/javaworld.com.txt @@ -0,0 +1,19 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: 
//meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.javaworld.com/article/3168052/security/open-source-users-its-time-for-extreme-vetting.html diff --git a/data/GrabberConfig/jdubuzz.com.txt b/data/GrabberConfig/jdubuzz.com.txt new file mode 100644 index 00000000..5cf7f005 --- /dev/null +++ b/data/GrabberConfig/jdubuzz.com.txt @@ -0,0 +1,13 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://www.journaldugeek.com/2015/09/09/apple-ipad-pro/ + +date: //meta[@property="og:updated_time"]/@content +next_page_link: //div[@class="post-content"]/div[@class='row pagination']/a[contains(concat(' ',normalize-space(@class),' '),' next ')] + +strip_id_or_class: jdg-recommend +strip_id_or_class: proofreader-bloc + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +test_url: http://www.jdubuzz.com/2015/09/11/le-meilleur-du-jduzap-cest-maintenant/ +test_url: http://www.jdubuzz.com/tests/les-20-comptes-instagram-des-filles-les-plus-sexy-a-suivre-en-2016/ diff --git a/data/GrabberConfig/je-suis-papa.com.txt b/data/GrabberConfig/je-suis-papa.com.txt new file mode 100644 index 00000000..2ba54698 --- /dev/null +++ b/data/GrabberConfig/je-suis-papa.com.txt @@ -0,0 +1,3 @@ +strip: //noscript + +test_url: http://www.je-suis-papa.com/pandacraft-kit-educatif-creatif-abonnement-12-ans/ diff --git a/data/GrabberConfig/jetzt.de.txt b/data/GrabberConfig/jetzt.de.txt new file mode 100644 index 00000000..3b138f5e --- /dev/null +++ 
b/data/GrabberConfig/jetzt.de.txt @@ -0,0 +1,16 @@ +title: //header/h2 +strip: //header/h2 + +body: //div[@class='article__header-teaser'] | //div[@class='article__content']//p | //div[@class='article__content']//img | //div[@class='article__content']//a +strip: //ul[contains(@class, 'sharingbar')] +strip: //ol[@class='teaser__widget'] + +author: substring-after(//div[@class='article__header-author'], 'Von ') +author: substring-after(//div[@class='article__header-author'], 'Interview: ') +author: //div[@class='article__header-author'] + +date: //meta[@property='article:modified_time']/@content + +test_url: http://www.jetzt.de/hauptsache-reset/haeftlinge-als-unternehmensgruender +test_url: http://www.jetzt.de/maedchenfrage/maedchen-fragen-jungs-warum-balanciert-ihr-an-roten-ampeln-auf-dem-rad +test_url: http://www.jetzt.de/meine-theorie/das-perfekte-alter-ist-eine-frage-der-perspektive diff --git a/data/GrabberConfig/jetzt.sueddeutsche.de.txt b/data/GrabberConfig/jetzt.sueddeutsche.de.txt new file mode 100644 index 00000000..00e4cf63 --- /dev/null +++ b/data/GrabberConfig/jetzt.sueddeutsche.de.txt @@ -0,0 +1,22 @@ +title: //h1 +author: //p[contains(@class, 'author')]/a +date: //p[contains(@class, 'time')] +body: //div[@class='content']/div[contains(@class, 'text')] + +# prevent "no text" errors on multi-page articles +tidy: no + +# we use a custom next-link detector instead of the print view because +# it's pretty hard to strip out the unwanted parts in the print view +autodetect_next_page: no +next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more '] + +strip: //h1 + +strip_id_or_class: meta +strip_id_or_class: author +strip_id_or_class: paging + +# prevent "Report an Error" from being recognized as footnote +footnotes: no +test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken
\ No newline at end of file diff --git a/data/GrabberConfig/jeuxvideo.com.txt b/data/GrabberConfig/jeuxvideo.com.txt new file mode 100644 index 00000000..bd02e74d --- /dev/null +++ b/data/GrabberConfig/jeuxvideo.com.txt @@ -0,0 +1,10 @@ +prune: no +body: //div[@class='corps-news text-enrichi-default'] +body: //div[@class='corps-article text-enrichi-default'] +body: //div[@class='corps-video text-enrichi-default'] +strip: //div[@class='bloc-contact-auteur'] +strip: //div[@class='liens-avis-lecteur'] + +test_url: http://www.jeuxvideo.com/news/431383/lancement-cosmique-pour-devouring-stars.htm +test_url: http://www.jeuxvideo.com/test/431612/massive-chalice-du-tour-par-tour-medieval-independant.htm +test_url: http://www.jeuxvideo.com/videos/431381/devouring-stars-moisson-d-etoiles.htm diff --git a/data/GrabberConfig/jezebel.com.txt b/data/GrabberConfig/jezebel.com.txt new file mode 100644 index 00000000..68ad4bd9 --- /dev/null +++ b/data/GrabberConfig/jezebel.com.txt @@ -0,0 +1,4 @@ +http_header(user-agent): PHP/5.3 + +test_url: http://jezebel.com/ryan-adams-has-not-been-mad-for-15-years-and-thats-a-pr-1792192941 +test_contains: Adams details an occasion in 2002 diff --git a/data/GrabberConfig/jjahnke.net.txt b/data/GrabberConfig/jjahnke.net.txt new file mode 100644 index 00000000..d45c8899 --- /dev/null +++ b/data/GrabberConfig/jjahnke.net.txt @@ -0,0 +1,4 @@ +body: //div[@class='entry'] +prune: no + +test_url: http://www.jjahnke.net/rundbr87.html#2514
\ No newline at end of file diff --git a/data/GrabberConfig/jneurosci.org.txt b/data/GrabberConfig/jneurosci.org.txt new file mode 100644 index 00000000..0b0515b1 --- /dev/null +++ b/data/GrabberConfig/jneurosci.org.txt @@ -0,0 +1,8 @@ +title: //h1[@id='page-title'] +author: //div[contains(concat(' ',normalize-space(@class),' '),' highwire-citation-jnl-eneuro-styles-article-title-complete-plus ')]//div[contains(concat(' ',normalize-space(@class),' '),' highwire-cite-authors ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' fulltext-view ')] +date: //div[contains(concat(' ',normalize-space(@class),' '),' highwire-citation-jnl-eneuro-styles-article-title-complete-plus ')]//div[contains(concat(' ',normalize-space(@class),' '),' highwire-cite-metadata ')]//span[contains(concat(' ',normalize-space(@class),' '),' highwire-cite-metadata-date ') and (contains(concat(' ',normalize-space(@class),' '),' highwire-cite-metadata '))] +strip: //header[@id='section-header'] +strip: //footer[@id='section-footer'] +prune: yes +test_url: http://www.jneurosci.org/content/22/13/5344.full diff --git a/data/GrabberConfig/jobbank.gc.ca.txt b/data/GrabberConfig/jobbank.gc.ca.txt new file mode 100644 index 00000000..1dbe2072 --- /dev/null +++ b/data/GrabberConfig/jobbank.gc.ca.txt @@ -0,0 +1,5 @@ +body: //div[@id='formatCont_en'] + +prune: no + +test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922
\ No newline at end of file diff --git a/data/GrabberConfig/joelonsoftware.com.txt b/data/GrabberConfig/joelonsoftware.com.txt new file mode 100644 index 00000000..241a361f --- /dev/null +++ b/data/GrabberConfig/joelonsoftware.com.txt @@ -0,0 +1,21 @@ +# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html + +author: substring-after(//div[@class="author"], 'by ') +date: //div[@class="date"] + +## Clean stuff at top ## + +strip: //h1[1] +strip: //h2[1] +strip: //div[@class="date"] +strip: //div[@class="author"] + +## Clean stuff at bottom ## + +strip: //blockquote[@class="textmessage"] +strip: //div[@style="width:500px"]/p[last()] +strip: //div[@style="width:500px"]/p[last()-1] +strip: //div[@style="width:500px"]/h4[last()] +strip: //div[@style="width:500px"]/h4[last()-1] +strip: //div[@style="width:500px"]/div[last()] +test_url: http://www.joelonsoftware.com/items/2011/09/15.html
\ No newline at end of file diff --git a/data/GrabberConfig/johannesbader.ch.txt b/data/GrabberConfig/johannesbader.ch.txt new file mode 100644 index 00000000..9e846e07 --- /dev/null +++ b/data/GrabberConfig/johannesbader.ch.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fjohannesbader.ch%2F2015%2F12%2Fkrakens-two-domain-generation-algorithms%2F + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' post-title ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' main ')]//div[contains(concat(' ',normalize-space(@class),' '),' pad-large ')] + +test_url: https://johannesbader.ch/2015/12/krakens-two-domain-generation-algorithms/ diff --git a/data/GrabberConfig/journaldugamer.com.txt b/data/GrabberConfig/journaldugamer.com.txt new file mode 100644 index 00000000..5d7d10b6 --- /dev/null +++ b/data/GrabberConfig/journaldugamer.com.txt @@ -0,0 +1,12 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://www.journaldugeek.com/2015/09/09/apple-ipad-pro/ + +date: //meta[@property="og:updated_time"]/@content +next_page_link: //div[@class="post-content"]/div[@class='row pagination']/a[contains(concat(' ',normalize-space(@class),' '),' next ')] + +strip_id_or_class: jdg-recommend +strip_id_or_class: proofreader-bloc + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +test_url: http://www.journaldugamer.com/2015/09/14/financier-desormais-tete-nintendo/ diff --git a/data/GrabberConfig/journaldugeek.com.txt b/data/GrabberConfig/journaldugeek.com.txt new file mode 100644 index 00000000..a63f186e --- /dev/null +++ b/data/GrabberConfig/journaldugeek.com.txt @@ -0,0 +1,15 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place 
this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://www.journaldugeek.com/2015/09/09/apple-ipad-pro/ + +date: //meta[@property="og:updated_time"]/@content +next_page_link: //div[@class="post-content"]/div[@class='row pagination']/a[contains(concat(' ',normalize-space(@class),' '),' next ')] + +body: //div[@class='post-content'] + +strip_id_or_class: jdg-recommend +strip_id_or_class: proofreader-bloc +strip_id_or_class: tests-push + +test_url: http://www.journaldugeek.com/2015/09/09/apple-ipad-pro/ +test_url: http://www.journaldugeek.com/tests/escape-game-maitres-jeu-vous-influencent-faire-gagner/ diff --git a/data/GrabberConfig/joystiq.com.txt b/data/GrabberConfig/joystiq.com.txt new file mode 100644 index 00000000..7a8e56f8 --- /dev/null +++ b/data/GrabberConfig/joystiq.com.txt @@ -0,0 +1,8 @@ +author: //a[@class="byline-author"] +title: //h1[@class="headline"] +strip: //div[@id="info-card"] +strip: //div[@id="breaking-news"] +strip: //div[@class="rmod list-post-mod"] +strip: //div[@id="footer"] +strip: //div[@id="GH_strip"] +test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/
\ No newline at end of file diff --git a/data/GrabberConfig/jsforcats.com.txt b/data/GrabberConfig/jsforcats.com.txt new file mode 100644 index 00000000..f2699ef1 --- /dev/null +++ b/data/GrabberConfig/jsforcats.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fjsforcats.com%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +test_url: http://jsforcats.com/ diff --git a/data/GrabberConfig/juedische-allgemeine.de.txt b/data/GrabberConfig/juedische-allgemeine.de.txt new file mode 100644 index 00000000..ff5a0244 --- /dev/null +++ b/data/GrabberConfig/juedische-allgemeine.de.txt @@ -0,0 +1,19 @@ +body: //div[@id='article_container'] +author: //h4//a[@class='author'] +title: //h1 + +replace_string(lang="en"): lang="de" +replace_string(/>1</a>):/></a> + +strip_id_or_class: share_toolbox +strip_id_or_class: article_header +strip_id_or_class: phototext + +strip_image_src: icon_author.gif + +strip: //img[@src=''] +strip: //h4[@id='author'] + +prune: no + +test_url: http://www.juedische-allgemeine.de/article/view/id/13366
\ No newline at end of file diff --git a/data/GrabberConfig/jungle-world.com.txt b/data/GrabberConfig/jungle-world.com.txt new file mode 100644 index 00000000..61e0087f --- /dev/null +++ b/data/GrabberConfig/jungle-world.com.txt @@ -0,0 +1,3 @@ +title: //h1 +body: //div[contains(@class,'story')] +test_url: http://jungle-world.com/artikel/2015/02/51207.html diff --git a/data/GrabberConfig/juppy.org.txt b/data/GrabberConfig/juppy.org.txt new file mode 100644 index 00000000..fdf7cdc9 --- /dev/null +++ b/data/GrabberConfig/juppy.org.txt @@ -0,0 +1,8 @@ +convert_double_br_tags: yes + +title: //div[@id="storycredits"]/p/span[@class="title"] +author: //div[@id="storycredits"]/p/br[1]/following-sibling::text() + +strip: //div[@id="storycredits"] + +test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005
\ No newline at end of file diff --git a/data/GrabberConfig/kachestvo.ru.txt b/data/GrabberConfig/kachestvo.ru.txt new file mode 100644 index 00000000..535693c4 --- /dev/null +++ b/data/GrabberConfig/kachestvo.ru.txt @@ -0,0 +1,3 @@ +body: //div[contains(@class, 'inner_content')] + +test_url: http://kachestvo.ru/promtovar/odezhda/denim.html
\ No newline at end of file diff --git a/data/GrabberConfig/kathimerini.gr.txt b/data/GrabberConfig/kathimerini.gr.txt new file mode 100644 index 00000000..2c7c518c --- /dev/null +++ b/data/GrabberConfig/kathimerini.gr.txt @@ -0,0 +1,4 @@ +title: //td[contains(@class, 'articleTitlos')] +body: //td[contains(@class, 'eelantext')] + +test_url: http://www.kathimerini.gr/4dcgi/_w_articles_kathremote_1_03/12/2013_530490
\ No newline at end of file diff --git a/data/GrabberConfig/kattascha.de.txt b/data/GrabberConfig/kattascha.de.txt new file mode 100644 index 00000000..ca8322b0 --- /dev/null +++ b/data/GrabberConfig/kattascha.de.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fkattascha.de%2F%3Fp%3D2207 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://kattascha.de/?p=2207 diff --git a/data/GrabberConfig/kenrockwell.com.txt b/data/GrabberConfig/kenrockwell.com.txt new file mode 100644 index 00000000..90c64cbf --- /dev/null +++ b/data/GrabberConfig/kenrockwell.com.txt @@ -0,0 +1,7 @@ +# Ads +strip: //table[@align="right"][@width="120"] + +# Affiliate link paragraphs +strip: //a[.="Adorama"]/parent::p[contains(., "goodies")] +strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")] +test_url: http://www.kenrockwell.com/tech/composition.htm
\ No newline at end of file diff --git a/data/GrabberConfig/keyboardmag.com.txt b/data/GrabberConfig/keyboardmag.com.txt new file mode 100644 index 00000000..4953d81d --- /dev/null +++ b/data/GrabberConfig/keyboardmag.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='article-inner'] +strip_id_or_class: date +author: //div[@class='author'] +test_url: http://www.keyboardmag.com/gear/1183/review-dave-smith-instruments-ob-6/58802 diff --git a/data/GrabberConfig/kicker.de.txt b/data/GrabberConfig/kicker.de.txt new file mode 100644 index 00000000..db4f63c4 --- /dev/null +++ b/data/GrabberConfig/kicker.de.txt @@ -0,0 +1,21 @@ +# set body +body: //div[@id='ovArtikel'] + +# set title +title: //div[@id='ovArtikel']/h1 +# strip main title and leave sub title +strip: //div[@id='ovArtikel']/h1 + +date: //div[@class='publicdate'] + +#remove captions +strip: //*/div[@class='bu'] +strip: //*/div[@class='credit'] + +#remove ads +strip: //*/div[@class='ad-head'] +strip: //*/div[@class='linksebay'] + +# remove video content +strip: //*/div[@class='ovVideo'] +test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html
\ No newline at end of file diff --git a/data/GrabberConfig/kickstarter.com.txt b/data/GrabberConfig/kickstarter.com.txt new file mode 100644 index 00000000..7b3daa58 --- /dev/null +++ b/data/GrabberConfig/kickstarter.com.txt @@ -0,0 +1,7 @@ +title: //h1[@id='name'] +body: //*[@id='leftcol'] + +strip_id_or_class: 'share-box' +strip_id_or_class: 'project-faqs' +strip_id_or_class: 'report-issue-wrap' +test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone
\ No newline at end of file diff --git a/data/GrabberConfig/kingarthurflour.com.txt b/data/GrabberConfig/kingarthurflour.com.txt new file mode 100644 index 00000000..b27539f5 --- /dev/null +++ b/data/GrabberConfig/kingarthurflour.com.txt @@ -0,0 +1,4 @@ +title: //div[@class='post']/h2 +body: //div[@class='entry'] +strip: //p[contains(.,'Tags:')] +test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/
\ No newline at end of file diff --git a/data/GrabberConfig/kingz.fr.txt b/data/GrabberConfig/kingz.fr.txt new file mode 100644 index 00000000..ba38c116 --- /dev/null +++ b/data/GrabberConfig/kingz.fr.txt @@ -0,0 +1,11 @@ + +author: //meta[@itemprop="author"]/@content + +body: //div[contains(concat(' ',normalize-space(@class),' '),' gp-entry-content ')] + +strip_id_or_class: gp-entry-meta +strip_id_or_class: gp-entry-header +strip_id_or_class: gp-post-navigation +strip_id_or_class: mashsb-container + +test_url: http://www.kingz.fr/trois-nouvelles-fresques-berges-de-seine/ diff --git a/data/GrabberConfig/korben.info.txt b/data/GrabberConfig/korben.info.txt new file mode 100644 index 00000000..8c99755b --- /dev/null +++ b/data/GrabberConfig/korben.info.txt @@ -0,0 +1,6 @@ +author: //meta[@name="author"]/@content +body: //div[@class="entry-content"] +strip: //div[@class="stagi-sous-article-rotation"] +strip_id_or_class: korben-sous-article + +test_url: https://korben.info/gitcoin-aider-a-financer-les-developpements-open-source-avec-de-lethereum.html diff --git a/data/GrabberConfig/kotaku.com.txt b/data/GrabberConfig/kotaku.com.txt new file mode 100644 index 00000000..116ceefa --- /dev/null +++ b/data/GrabberConfig/kotaku.com.txt @@ -0,0 +1,8 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' headline ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +author: //div[contains(concat(' ',normalize-space(@class),' '),' meta__byline ')] +date: //a[contains(concat(' ',normalize-space(@class),' '),' js_entry-link.js_publish_time ')] + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' ad-mobile ')] + +test_url: https://kotaku.com/the-last-of-us-part-ii-s-violence-is-designed-to-be-rep-1826781044 diff --git a/data/GrabberConfig/kottke.org.txt b/data/GrabberConfig/kottke.org.txt new file mode 100644 index 00000000..582f251c --- /dev/null +++ b/data/GrabberConfig/kottke.org.txt @@ -0,0 +1,6 @@ +title: //h2 +author: 
//*[@id='main']/div/a[1] +date: substring-before(substring-after(//div[@class='meta'],'•'),'•') +body: //div[@id='main'] +strip: //div[@class='meta'] +test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters
\ No newline at end of file diff --git a/data/GrabberConfig/kulturegeek.fr.txt b/data/GrabberConfig/kulturegeek.fr.txt new file mode 100644 index 00000000..ce08be39 --- /dev/null +++ b/data/GrabberConfig/kulturegeek.fr.txt @@ -0,0 +1,13 @@ +title: //h1[@class='post-name'] +author: //span[@class="police_info reviewer"]/a +date: //span[@class="police_info dtreviewed"] +body: //div[@class="post-content"] + +strip: //div[@class="yarpp-related"] +strip: //div[@id="pafternews"] +strip: //div[@class="partage_reseaux2"] +strip: //div[@class="partage_reseaux"] +strip: //a[@class="enp-report"] +strip: //div[@class="adsPost"] + +test_url: http://kulturegeek.fr/news-84253/plan-promos-high-tech-week-end-13 diff --git a/data/GrabberConfig/kumailplus.com.txt b/data/GrabberConfig/kumailplus.com.txt new file mode 100644 index 00000000..2f604de0 --- /dev/null +++ b/data/GrabberConfig/kumailplus.com.txt @@ -0,0 +1,3 @@ +body: //div[@class = "entry-full"] + +test_url: http://www.kumailplus.com/2011/12/02/24308
\ No newline at end of file diff --git a/data/GrabberConfig/kumb.com.txt b/data/GrabberConfig/kumb.com.txt new file mode 100644 index 00000000..fe350622 --- /dev/null +++ b/data/GrabberConfig/kumb.com.txt @@ -0,0 +1,10 @@ +title: //div[@id='centrediv']/h1 + +author: substring-after(//div[@id='centrediv']/h3,'By: ') + +date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ') + +body: //div[@class='KonaBody'] + +convert_double_br_tags: yes +test_url: http://www.kumb.com/story.php?id=126084
\ No newline at end of file diff --git a/data/GrabberConfig/kwerfeldein.de.txt b/data/GrabberConfig/kwerfeldein.de.txt new file mode 100644 index 00000000..cf4d3b8c --- /dev/null +++ b/data/GrabberConfig/kwerfeldein.de.txt @@ -0,0 +1,9 @@ +date: //span[@class='datum'] +title: //div[@class='artikel']/h2 +body: //div[@class='entry'] +strip: //p[@class='tags'] +author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ') +strip: //div[@class='authorinfo'] +strip: //div[@class='authorpic'] + +test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/
\ No newline at end of file diff --git a/data/GrabberConfig/labs.mwrinfosecurity.com.txt b/data/GrabberConfig/labs.mwrinfosecurity.com.txt new file mode 100644 index 00000000..d112fdd7 --- /dev/null +++ b/data/GrabberConfig/labs.mwrinfosecurity.com.txt @@ -0,0 +1,4 @@ +# Title is not detected automatically +title: //section[contains(@class, 'section-intro')]//h1 + +test_url: https://labs.mwrinfosecurity.com/blog/hp-nonstop-basics/ diff --git a/data/GrabberConfig/lalettrea.fr.txt b/data/GrabberConfig/lalettrea.fr.txt new file mode 100644 index 00000000..ca346e3b --- /dev/null +++ b/data/GrabberConfig/lalettrea.fr.txt @@ -0,0 +1,26 @@ + +# Any modifications done here should be duplicated in +# - africaintelligence.fr.txt +# - intelligenceonline.fr.txt +# as they seem to use the exact same CMS software as lalettrea.fr + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-chapo ')] | //div[contains(concat(' ',normalize-space(@class),' '),' article-body ')] + +prune: no + +strip_id_or_class: sidenav +strip_id_or_class: sidenav-content +strip_id_or_class: article-copyright + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //form[contains(concat(' ',normalize-space(@class),' '),' form-login ')] +login_uri: https://www.lalettrea.fr/ajax/login/login +login_username_field: Identifiant +login_password_field: MotDePasse +login_extra_fields: ConnexionAuto=on +login_extra_fields: OrigineLogin=Landing + +test_url: https://www.lalettrea.fr/entreprises_conseil-et-services/2018/06/26/bearingpoint--les-associes-mal-notes-et-les-autres,108314724-gra diff --git a/data/GrabberConfig/lalibre.be.txt b/data/GrabberConfig/lalibre.be.txt new file mode 100644 index 00000000..14e467c9 --- /dev/null +++ b/data/GrabberConfig/lalibre.be.txt @@ -0,0 +1,9 @@ +title: //h1[@class='mainContentTitle'] 
+date: //span[@class='publication']/time/@datetime + +body: //div[@class='articleText'] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.lalibre.be/actu/international/emmanuelle-mignon-l-ex-directrice-de-cabinet-de-sarkozy-mise-en-examen-56b10c9d3570b1fc10e24f20 diff --git a/data/GrabberConfig/landetsfria.se.txt b/data/GrabberConfig/landetsfria.se.txt new file mode 100644 index 00000000..e5317a5a --- /dev/null +++ b/data/GrabberConfig/landetsfria.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.landetsfria.se/artikel/112070
\ No newline at end of file diff --git a/data/GrabberConfig/lapin-blanc.blogs.docteo.net.txt b/data/GrabberConfig/lapin-blanc.blogs.docteo.net.txt new file mode 100644 index 00000000..38835a90 --- /dev/null +++ b/data/GrabberConfig/lapin-blanc.blogs.docteo.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Flapin-blanc.blogs.docteo.net%2F2015%2F01%2F20%2Flencadrement-doctoral-deviendrait-il-un-sujet-dactualite%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://lapin-blanc.blogs.docteo.net/2015/01/20/lencadrement-doctoral-deviendrait-il-un-sujet-dactualite/ diff --git a/data/GrabberConfig/lapresse.ca.txt b/data/GrabberConfig/lapresse.ca.txt new file mode 100644 index 00000000..a3d92877 --- /dev/null +++ b/data/GrabberConfig/lapresse.ca.txt @@ -0,0 +1,7 @@ +body: //div[@class='article-page']//p[@class='amorce'] | //div[@class='article-page']//div[contains(@class, 'entry')] +author: //div[@class='infosAuteur'] + +strip: //ul[@class='stories'] + +test_url:http://www.lapresse.ca/actualites/national/201611/30/01-5046565-coup-dur-pour-les-radars-photo-plusieurs-constats-pourraient-etre-annules.php +test_url: http://www.lapresse.ca/le-soleil/vivre-ici/la-science-au-quotidien/201610/01/01-5026482-les-vertus-de-leau-degout.php diff --git a/data/GrabberConfig/laquadrature.net.txt b/data/GrabberConfig/laquadrature.net.txt new file mode 100644 index 00000000..746bfca7 --- /dev/null +++ b/data/GrabberConfig/laquadrature.net.txt @@ -0,0 +1,10 @@ +body: //div[@id='content-content']//div[@class='content'] +title: //h1[@class='title'] +date: substring-after(//*[@class='submitted'],'Submitted on') +tidy: no +strip: //div[@class='terms terms-inline'] +strip: //div[@class='more'] +strip: //div[@class='share-links'] +strip: //table[@id='attachments'] + +test_url: 
http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution
\ No newline at end of file diff --git a/data/GrabberConfig/lareviewofbooks.org.txt b/data/GrabberConfig/lareviewofbooks.org.txt new file mode 100644 index 00000000..25e36543 --- /dev/null +++ b/data/GrabberConfig/lareviewofbooks.org.txt @@ -0,0 +1,12 @@ +#metadata +title: substring-before(//title,' |') +author: //a[contains(@class,'person') and starts-with(@href, '/contributor')] + +#text +body: //div[contains(@class, 'article_body')] + +#clean up +strip_id_or_class: recommended_section + +test_url: http://lareviewofbooks.org/review/american-politics-redeembale-robert-gates-hillary-clinton-two-memoirs-washington-dc +test_url: http://lareviewofbooks.org/interview/souvenirs-future diff --git a/data/GrabberConfig/latimes.com.txt b/data/GrabberConfig/latimes.com.txt new file mode 100644 index 00000000..b2db37bf --- /dev/null +++ b/data/GrabberConfig/latimes.com.txt @@ -0,0 +1,11 @@ +strip: //div[@id="tugs_story_display"] +strip: //div[@id="search_overlay"] +strip: //div[@id="adv_search"] +body: //div[@class='story'] +tidy: no +convert_double_br_tags: yes +single_page_link: //a[contains(@href, ',print.')] +strip: //p[starts-with(., 'latimes.com')] +strip: //h1[starts-with(., 'latimes.com')] +strip_id_or_class: cubead +test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story
\ No newline at end of file diff --git a/data/GrabberConfig/laughingsquid.com.txt b/data/GrabberConfig/laughingsquid.com.txt new file mode 100644 index 00000000..ab2f834f --- /dev/null +++ b/data/GrabberConfig/laughingsquid.com.txt @@ -0,0 +1,3 @@ +title: //h1[@class='entry-title'] +body: //div[@class='entry-content'] +test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/
\ No newline at end of file diff --git a/data/GrabberConfig/lawfareblog.com.txt b/data/GrabberConfig/lawfareblog.com.txt new file mode 100644 index 00000000..49d858a7 --- /dev/null +++ b/data/GrabberConfig/lawfareblog.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='field-items'] +title: //div[@class='title'] + +test_url: https://lawfareblog.com/limits-panopticon diff --git a/data/GrabberConfig/leancrew.com.txt b/data/GrabberConfig/leancrew.com.txt new file mode 100644 index 00000000..e78cf7e6 --- /dev/null +++ b/data/GrabberConfig/leancrew.com.txt @@ -0,0 +1,9 @@ +title: //div[@id="content"]/h1[1] +date: substring-before(//p[@class="postdate"], ' at ') +author: ("Dr. Drang") + +strip: //div[@id="content"]/h1[1] +strip: //p[@class="postdate"] +strip: //h2[@id="respond"] +strip: //blockquote[@class="bbpTweet"]/p/span/a/img +test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
\ No newline at end of file diff --git a/data/GrabberConfig/leblogduhacker.fr.txt b/data/GrabberConfig/leblogduhacker.fr.txt new file mode 100644 index 00000000..7fb215c1 --- /dev/null +++ b/data/GrabberConfig/leblogduhacker.fr.txt @@ -0,0 +1,10 @@ +title: //meta[@name='og:title']/@content +body: //main[@class='l-content']/section[@class='l-section']/div + +strip: //div[@id='jp-relatedposts'] + +prune: no +tidy: no + +test_url: https://www.leblogduhacker.fr/pourquoi-est-il-important-detre-discret-sur-internet/ + diff --git a/data/GrabberConfig/lececil.org.txt b/data/GrabberConfig/lececil.org.txt new file mode 100644 index 00000000..8a38bbf3 --- /dev/null +++ b/data/GrabberConfig/lececil.org.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.lececil.org%2Fnode%2F7688 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' field-body ')] +test_url: http://www.lececil.org/node/7688 diff --git a/data/GrabberConfig/lecker.de.txt b/data/GrabberConfig/lecker.de.txt new file mode 100644 index 00000000..b1097811 --- /dev/null +++ b/data/GrabberConfig/lecker.de.txt @@ -0,0 +1,23 @@ +# author: kreativmonkey + +# Article information +title: //h1 + + +# Content +body: //article[contains(@class, 'recipe')] + + +# Cleanup +strip_id_or_class: social-bar +strip_id_or_class: social-bar-sticky +strip_id_or_class: dikr-teaser-native +strip_id_or_class: list--tags +strip_id_or_class: nativendo-artikel +strip_id_or_class: adverteaser +strip_id_or_class: taboolaWrapper +strip_id_or_class: dikr-responsive-ads-slot +strip_id_or_class: article-header__copyright + +test_url: http://www.lecker.de/lachstatar-mit-honig-limetten-dressing-64756.html +test_content: entkernen und fein würfeln diff --git a/data/GrabberConfig/ledoc-info.com.txt b/data/GrabberConfig/ledoc-info.com.txt new file mode 100644 index 00000000..4375250e --- 
/dev/null +++ b/data/GrabberConfig/ledoc-info.com.txt @@ -0,0 +1,22 @@ + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' entry-title ')] + +body: //div[contains(concat(' ',normalize-space(@class),' '),' single-thumb ')] | //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +strip_id_or_class: message_pms +strip_id_or_class: td_login_message +strip_id_or_class: td_pay_message +strip_id_or_class: dkpdf-button-container + +test_url: https://ledoc-info.com/2018/07/01/photodoc-lallee-tourisme-passage-plus-glauque-de-montpellier/ + +# Wallabag-specific login directives (not supported in FTR): +requires_login: yes +login_uri: https://ledoc-info.com/wp-login.php +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' login_message ')] +login_username_field: log +login_password_field: pwd +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Se connecter + +test_url: https://ledoc-info.com/2018/02/08/grande-motte-projets-divisent/ diff --git a/data/GrabberConfig/lefigaro.fr.txt b/data/GrabberConfig/lefigaro.fr.txt new file mode 100644 index 00000000..7e1d12d7 --- /dev/null +++ b/data/GrabberConfig/lefigaro.fr.txt @@ -0,0 +1,9 @@ +title: //meta[@name='title']/@content +author: //span[@class='sign']//a[@class='journaliste'] +author: //meta[@name='author']/@content +body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte'] +date: //li[contains(concat(' ',normalize-space(@class),' '),' fig-date-pub ')]//time +prune: no +test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php +test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php +test_url: http://www.lefigaro.fr/social/2015/03/10/09010-20150310ARTFIG00312-encore-une-annee-noire-pour-l-emploi-salarie.php diff --git 
a/data/GrabberConfig/lefilrouge.media.txt b/data/GrabberConfig/lefilrouge.media.txt new file mode 100644 index 00000000..d5f3ab13 --- /dev/null +++ b/data/GrabberConfig/lefilrouge.media.txt @@ -0,0 +1,4 @@ + +strip_id_or_class: the_champ_sharing_container + +test_url: https://lefilrouge.media/etats-unis-nevada-burning-man-festival/ diff --git a/data/GrabberConfig/lejournal.cnrs.fr.txt b/data/GrabberConfig/lejournal.cnrs.fr.txt new file mode 100644 index 00000000..a995d619 --- /dev/null +++ b/data/GrabberConfig/lejournal.cnrs.fr.txt @@ -0,0 +1,11 @@ +single_page_link: //a[@class='print-page'] + +title: //h1[@class='node-title'] + +body: //div[@class='article-contenu'] + +# Strip duplicated pictures +strip_id_or_class: white-popup +strip_id_or_class: definition + +test_url: https://lejournal.cnrs.fr/articles/la-ceramique-sert-a-tout diff --git a/data/GrabberConfig/lemonde.fr.txt b/data/GrabberConfig/lemonde.fr.txt new file mode 100644 index 00000000..43ee40d9 --- /dev/null +++ b/data/GrabberConfig/lemonde.fr.txt @@ -0,0 +1,60 @@ +title: //h1[@class='article__title'] + +# We can have multiple authors +author: //span[contains(concat(' ',normalize-space(@class),' '),' author__name ')] + +# Last edition date (if any) +date: //time[@itemprop='dateModified']/@datetime +# Publication date +date: //time[@itemprop='datePublished']/@datetime + +body: //section[@class='article__content'] + +# Another body selector and strip for video-only links +body: //section[contains(concat(' ',normalize-space(@class), ' '), ' video ')] +strip: //div[contains(concat(' ',normalize-space(@class), ' '), ' related-content--video ')] + +# Remove "Lire aussi" blocks +strip: //section[contains(concat(' ',normalize-space(@class),' '),' catcher ')] + +# Remove "Lire aussi" paragraphs (just containing "Lire" in strong and a link) +strip: //p[contains(strong, 'Lire') and a] + +# Remove comments +strip: //*[contains(@class, 'comments')] + +# Remove "Article réservé aux abonnés" +strip: 
//p[@class='article__status'] + +# Remove quotes highlighted in articles, duplicates of content +# We use parent::blockquote to avoid a remaining empty blockquote node +strip: //p[@class='article__quote']/parent::blockquote + +# Remove share buttons +strip://ul[contains(@class, 'meta__social')] + +# Remove the insane "conjugaison.lemonde.fr" links: +find_string: <a target='_blank' onclick='return false;' class='lien_interne conjug' +replace_string: <input type='hidden' style='display:none;' + +# Remove the insane cross-linking categories "Toute l’actualité" +find_string: <a class="lien_interne rub" +replace_string: <input type="hidden" style="display:none;" + +requires_login: yes + +login_uri: https://secure.lemonde.fr/sfuser/connexion +login_username_field: connection[mail] +login_password_field: connection[password] + +login_extra_fields: connection[_token]=@=xpath("//form//input[@id='connection__token']", request_html(config.getLoginUri())) + +not_logged_in_xpath: //section[contains(concat(' ',normalize-space(@class),' '),' paywall__container ')]//p[@class='paywall__login'] + + +prune: no + +test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html +test_url: http://www.lemonde.fr/big-browser/article/2017/10/27/assassinat-de-kennedy-ce-qu-on-a-appris-dans-les-documents-declassifies_5207029_4832693.html +test_url: https://www.lemonde.fr/pixels/article/2018/07/14/douze-jeux-video-pour-s-amuser-a-plusieurs_5331269_4408996.html +test_url: https://www.lemonde.fr/sante/video/2016/04/07/diabete-pourquoi-une-telle-progression-de-l-epidemie_4898147_1651302.html diff --git a/data/GrabberConfig/lenta.ru.txt b/data/GrabberConfig/lenta.ru.txt new file mode 100644 index 00000000..c2814794 --- /dev/null +++ b/data/GrabberConfig/lenta.ru.txt @@ -0,0 +1,9 @@ +body: //div[@itemprop='articleBody'] | //img[@itemprop='image'] +strip_id_or_class: b-inline-topics-box +strip: //iframe[@name='banner'] + +# do not 
prune embedded videos +prune: no + +test_url: https://lenta.ru/news/2016/07/08/weakpound/ +test_url: https://lenta.ru/news/2017/11/01/bigdata/ diff --git a/data/GrabberConfig/lequatreheures.com.txt b/data/GrabberConfig/lequatreheures.com.txt new file mode 100644 index 00000000..4da7ae4a --- /dev/null +++ b/data/GrabberConfig/lequatreheures.com.txt @@ -0,0 +1,23 @@ + +body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')] + +# the content of the article is actually contained in an <iframe>, +# we use the single_page_link directive to redirect to it: +single_page_link: //div[contains(concat(' ',normalize-space(@class),' '),' content-container ')]//iframe/@data-src-desktop + +tidy: no +prune: no + +strip: //script + +test_url: https://www.lequatreheures.com/episodes/condamnes-a-lair-libre-travaux-interet-general-prison + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //a[@href='https://www.lequatreheures.com/abonnez-vous/'] +login_uri: https://www.lequatreheures.com/login/connect/ +login_username_field: username +login_password_field: password +login_extra_fields: form_valid_uid=@=xpath('//input[@name="form_valid_uid"]', request_html('https://www.lequatreheures.com/connexion/')) diff --git a/data/GrabberConfig/lesnumeriques.com.txt b/data/GrabberConfig/lesnumeriques.com.txt new file mode 100644 index 00000000..51e025ae --- /dev/null +++ b/data/GrabberConfig/lesnumeriques.com.txt @@ -0,0 +1,9 @@ +title: //h1/following::span[@class='fn'] +# Author: should stop parsing until <br> reached, but I don't know how to do this. 
+author: //following::div[@class='PDate2'] +date: //following::div[@class='PDate2']/strong + +body: //div[@class='ArTexte'] +body: //div[@id='prod_txt_b'] +body: //div[@class='ArPhotoP'] +test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html
\ No newline at end of file diff --git a/data/GrabberConfig/letraslibres.com.txt b/data/GrabberConfig/letraslibres.com.txt new file mode 100644 index 00000000..cf271bca --- /dev/null +++ b/data/GrabberConfig/letraslibres.com.txt @@ -0,0 +1,3 @@ +single_page_link: concat(link[@rel="canonical"], "?page=full") + +test_url: http://www.letraslibres.com/revista/dossier/quien-manda-en-europa diff --git a/data/GrabberConfig/lezephyrmag.com.txt b/data/GrabberConfig/lezephyrmag.com.txt new file mode 100644 index 00000000..84dc61a5 --- /dev/null +++ b/data/GrabberConfig/lezephyrmag.com.txt @@ -0,0 +1,6 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +strip_id_or_class: swp_social_panel + +test_url: https://lezephyrmag.com/libre/paroles/jean-roch-waro-voyager-avec-la-musique-est-un-acte-poetique/ diff --git a/data/GrabberConfig/libcom.org.txt b/data/GrabberConfig/libcom.org.txt new file mode 100644 index 00000000..d1404d10 --- /dev/null +++ b/data/GrabberConfig/libcom.org.txt @@ -0,0 +1,7 @@ +date: //span[contains(@class, 'page-date')] +body: //div[@id='node-page'] +strip_id_or_class: book-navigation +prune: no + +test_url: http://libcom.org/library/what-was-the-ussr-aufheben-1 +test_url: http://libcom.org/library-latest/feed
\ No newline at end of file diff --git a/data/GrabberConfig/liberation.checknews.fr.txt b/data/GrabberConfig/liberation.checknews.fr.txt new file mode 100644 index 00000000..ba6a7048 --- /dev/null +++ b/data/GrabberConfig/liberation.checknews.fr.txt @@ -0,0 +1,3 @@ +title: //div[@class="big"] + +test_url: https://liberation.checknews.fr/question/37211/est-il-vrai-que-la-depense-de-sante-en-france-est-beaucoup-plus-importante-que-dans-les-autres-pays-de-locde diff --git a/data/GrabberConfig/liberation.fr.txt b/data/GrabberConfig/liberation.fr.txt new file mode 100644 index 00000000..36bd7776 --- /dev/null +++ b/data/GrabberConfig/liberation.fr.txt @@ -0,0 +1,7 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' width-padded-left ')] +strip_id_or_class: article-rel-item +author: //div[@class='article-head-metas']/a +date: //span[@class='date']/time + +test_url: http://www.liberation.fr/elections-presidentielle-legislatives-2017/2017/04/14/pas-assez-enthousiastes-ils-se-font-virer-d-un-meeting-de-fillon_1562897 +test_contains: Alors comment expliquer diff --git a/data/GrabberConfig/lifehacker.co.uk.txt b/data/GrabberConfig/lifehacker.co.uk.txt new file mode 100644 index 00000000..c540f7f3 --- /dev/null +++ b/data/GrabberConfig/lifehacker.co.uk.txt @@ -0,0 +1,7 @@ +title: //div[@itemprop='headline'] +body: //noscript/img | //div[@itemprop='text'] +author: //div[@class='meta meta--post']//a[@class='is-author'] +date: //div[@class='meta meta--post']//time/@datetime + +test_url: http://www.lifehacker.co.uk/2014/08/22/dealhacker-10-google-chromecast-super-cheap-batteries-much +test_url: http://www.lifehacker.co.uk/2014/08/18/andrognito-hides-files-youd-like-keep-away-prying-eyes diff --git a/data/GrabberConfig/lifehacker.com.txt b/data/GrabberConfig/lifehacker.com.txt new file mode 100644 index 00000000..330c4e78 --- /dev/null +++ b/data/GrabberConfig/lifehacker.com.txt @@ -0,0 +1,53 @@ +# Adds author text: Gawker sites commonly show as "Author: View 
Profile" +author://a[@class="plus-icon modfont"] + +# Add date and time +date: //span[@class="date"] + +body: //div[contains(@class, 'marquee-asset-wrapper') or contains(@class, 'post-content')] + +# Remove date and time from article text +strip: //span[@class="date"] + +# Remove login/comment text +strip: //*[(@class="presence_control_external smalltype")] + +strip: //div[@class="nodebyline modfont"] + +# Remove right sidebar +strip: //div[@id="rightwrapper"] + +# Remove print header +strip: //div[@id='printhead']/h1 + +# Remove 'content is restricted' +strip: //div[@id='agegate_IDHERE'] + +# Remove follow text +strip: //*[(@class="permalink_ads")] + +strip_id_or_class: inset_groups + +# Remove view/comment count +strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line'] + +# Remove contact text +strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo'] + +# Remove medium duplicates of the article image +strip_image_src: medium.jpg + +# Remove "arrow" class at bottom of page +strip: //p[@class="arrow"] + +# Remove "track" image from article body +strip: //img[@alt="track"] + +# Remove hidden URLs +strip: //a[@x-inset="hidden"] + +http_header(user-agent): PHP/5.3 + +test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos +test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse +test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314 diff --git a/data/GrabberConfig/lifestyle.inquirer.net.txt b/data/GrabberConfig/lifestyle.inquirer.net.txt new file mode 100644 index 00000000..25d544ae --- /dev/null +++ b/data/GrabberConfig/lifestyle.inquirer.net.txt @@ -0,0 +1,7 @@ +title: //h1[@class='singlePageTitle'] + +strip: //p[contains(text(), 'Follow Us')] +strip: //p/strong[contains(text(), 'Recent Stories:')] +strip: //div[@id="sharefeature"] + 
+test_url: http://lifestyle.inquirer.net/100223/dusting-your-ceiling-fan diff --git a/data/GrabberConfig/lifeweek.com.cn.txt b/data/GrabberConfig/lifeweek.com.cn.txt new file mode 100644 index 00000000..e09f6692 --- /dev/null +++ b/data/GrabberConfig/lifeweek.com.cn.txt @@ -0,0 +1,23 @@ +# This filter is tested on: +# http://www.lifeweek.com.cn/2012/1211/39439.shtml +# http://www.lifeweek.com.cn/2013/0308/40213.shtml + +title:substring-before(//h1, '(') +title://h1 +date://ul[@class='authorbox']/li +author: substring-after(//ul[@class='authorbox']/li/following-sibling::li, '作者:') + +next_page_link: //div[@class='pageturn_list']/a[@class='pagedown'] +body: //div[@class='original '] + +strip://h1 +strip://ul[@class='authorbox'] +strip://span[@class='app_p'] +strip://div[@style='text-align:right;'] +strip://div[@class='pageturn_list'] +strip://div[@class='lifespeaks'] +strip://div[@class='vright fr'] +strip://div[@class='copyrt mg20'] +strip://div[@class='keyabout mg20'] +strip://ul[@class='readabout mg20'] +test_url: http://www.lifeweek.com.cn/2013/0308/40213.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/limprevu.fr.txt b/data/GrabberConfig/limprevu.fr.txt new file mode 100644 index 00000000..431c2310 --- /dev/null +++ b/data/GrabberConfig/limprevu.fr.txt @@ -0,0 +1,16 @@ + +# remark: the web page we obtain will inevitably contain more than one article, +# we keep only the first one: +body: (//div[contains(concat(' ',normalize-space(@class),' '),' article__edito ')])[1] + +author: (//header[contains(concat(' ',normalize-space(@class),' '),' article__header ')])[1]//p[contains(concat(' ',normalize-space(@class),' '),' author-info__content__text ')]/a + +tidy: no +prune: no + +strip: //script +strip_id_or_class: support-block +strip_id_or_class: subscription-box +strip_id_or_class: container_hidden + +test_url: https://limprevu.fr/articles/17-05-2018/pour-embrasser-le-numerique-noubliez-pas-la-langue/ diff --git a/data/GrabberConfig/linkedin.com.txt b/data/GrabberConfig/linkedin.com.txt new file mode 100644 index 00000000..04052e48 --- /dev/null +++ b/data/GrabberConfig/linkedin.com.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.linkedin.com%2Fpulse%2F20140328055547-78273192-how-to-become-seo-expert + +body: //div[contains(concat(' ',normalize-space(@class),' '),' prose ')] +test_url: https://www.linkedin.com/pulse/20140328055547-78273192-how-to-become-seo-expert + diff --git a/data/GrabberConfig/linux-community.de.txt b/data/GrabberConfig/linux-community.de.txt new file mode 100644 index 00000000..50319750 --- /dev/null +++ b/data/GrabberConfig/linux-community.de.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: 
http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.linux-community.de%2Fausgaben%2Flinuxuser%2F2012%2F08%2Fahnenforschung-mit-der-genealogie-software-gramps%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-content ')] +next_page_link: //link[contains(@rel, 'next')]/@href +test_url: http://www.linux-community.de/ausgaben/linuxuser/2012/08/ahnenforschung-mit-der-genealogie-software-gramps/ diff --git a/data/GrabberConfig/linuxjournal.com.txt b/data/GrabberConfig/linuxjournal.com.txt new file mode 100644 index 00000000..c5e64463 --- /dev/null +++ b/data/GrabberConfig/linuxjournal.com.txt @@ -0,0 +1,6 @@ +body: //div[@class='content-area'] +next_page_link: //a[@title='Go to next page'] +author: //a[@title='View user profile.'] +strip_id_or_class: comments + +test_url: http://www.linuxjournal.com/content/be-mechanicwith-android-and-linux diff --git a/data/GrabberConfig/linuxnix.com.txt b/data/GrabberConfig/linuxnix.com.txt new file mode 100644 index 00000000..27db4c22 --- /dev/null +++ b/data/GrabberConfig/linuxnix.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +strip_id_or_class: ts-fab-wrapper + +test_url: https://www.linuxnix.com/how-to-rotate-logs-manually-in-linux/ diff --git a/data/GrabberConfig/lithub.com.txt b/data/GrabberConfig/lithub.com.txt new file mode 100644 index 00000000..c630c43d --- /dev/null +++ b/data/GrabberConfig/lithub.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' post_wrapper_inner ')]//div[contains(concat(' ',normalize-space(@class),' '),' post_inner_wrapper ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' post_tag ')] + +test_url: http://lithub.com/why-indie-presses-are-opening-bookstores/ diff --git a/data/GrabberConfig/livescience.com.txt b/data/GrabberConfig/livescience.com.txt new file mode 100644 index 00000000..5275d34a --- /dev/null +++ b/data/GrabberConfig/livescience.com.txt 
@@ -0,0 +1,20 @@ +title: //div[@class="album_title"]//h1 +author: substring-before(//div[@class='by_line'], ',') +date: substring-after(substring-before(//div[@class="album_time"], ' Time'), 'Date: ') +body: //div[@class="about_text"] + +strip: //div[@class='large_popper'] +strip: //span[contains(@id, 'mag_glass')] +strip: //span[contains(@class, 'img_overlay')] +strip: //td//span +strip: //div[@class="center_adsense"] +strip: //div[@class="article_info"]//div[@class='asset_section'] +strip: //div[@class="article_additional"] +strip: //div[contains(@style, 'overflow:hidden')] +strip: //div[@class="aa_text"] +strip: //div[@id='nointelliTXT'] + +prune: no +autodetect_on_failure: no + +test_url: http://www.livescience.com/34569-why-flowers-close-at-night-nyctinasty.html diff --git a/data/GrabberConfig/longform.org.txt b/data/GrabberConfig/longform.org.txt new file mode 100644 index 00000000..1310ec0d --- /dev/null +++ b/data/GrabberConfig/longform.org.txt @@ -0,0 +1,3 @@ +single_page_link: //div[@class="post"]/div[@class="title"]/a + +test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/
\ No newline at end of file diff --git a/data/GrabberConfig/loopinsight.com.txt b/data/GrabberConfig/loopinsight.com.txt new file mode 100644 index 00000000..730af947 --- /dev/null +++ b/data/GrabberConfig/loopinsight.com.txt @@ -0,0 +1,9 @@ +body: //div[@class='container_16']//div[@class='grid_11'] +strip: //h2[@class='mast'] +strip: //div[@class='container_16']//div[@class='grid_11']/h1 +strip: //div[@class='container_16']//div[@class='grid_11']/p[1] +strip: //div[@class='container_16']//div[@class='grid_11']/div +author: //a[starts-with(@title, 'Posts by')] +date: substring-before(substring-after(//time, 'Posted on '), ' at') +test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/ +test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/
\ No newline at end of file diff --git a/data/GrabberConfig/lostgarden.com.txt b/data/GrabberConfig/lostgarden.com.txt new file mode 100644 index 00000000..d7eb0fa0 --- /dev/null +++ b/data/GrabberConfig/lostgarden.com.txt @@ -0,0 +1,3 @@ +prune: no +convert_double_br_tags: yes +test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html
\ No newline at end of file diff --git a/data/GrabberConfig/lowtechmagazine.com.txt b/data/GrabberConfig/lowtechmagazine.com.txt new file mode 100644 index 00000000..2588ac5f --- /dev/null +++ b/data/GrabberConfig/lowtechmagazine.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.lowtechmagazine.com%2F2015%2F12%2Freinventing-the-greenhouse.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-inner ')] +test_url: http://www.lowtechmagazine.com/2015/12/reinventing-the-greenhouse.html
\ No newline at end of file diff --git a/data/GrabberConfig/lrb.co.uk.txt b/data/GrabberConfig/lrb.co.uk.txt new file mode 100644 index 00000000..0913b3aa --- /dev/null +++ b/data/GrabberConfig/lrb.co.uk.txt @@ -0,0 +1,12 @@ +title: //div[contains(@class, "article-body")]/hgroup/h1 +body: //div[contains(@class, "article-body")] + +date: substring-after(//p[@class="meta-info"]/a, '· ') + +author: //p[@class='intro']/a[contains(@rel, 'author')]/@content + +strip_id_or_class: print-hide +strip_id_or_class: books + +test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened +test_url: http://www.lrb.co.uk/v36/n13/benjamin-kunkel/paupers-and-richlings diff --git a/data/GrabberConfig/lukew.com.txt b/data/GrabberConfig/lukew.com.txt new file mode 100644 index 00000000..05f83f11 --- /dev/null +++ b/data/GrabberConfig/lukew.com.txt @@ -0,0 +1,3 @@ +title: //h1 + +test_url: https://www.lukew.com/ff/entry.asp?1995 diff --git a/data/GrabberConfig/luminous-landscape.com.txt b/data/GrabberConfig/luminous-landscape.com.txt new file mode 100644 index 00000000..b445f5eb --- /dev/null +++ b/data/GrabberConfig/luminous-landscape.com.txt @@ -0,0 +1,6 @@ +title: //h2 + +body: // div[@id='content'] + +strip: //div[@class='sidebar_wrapper'] +test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/lupa.cz.txt b/data/GrabberConfig/lupa.cz.txt new file mode 100644 index 00000000..08833d42 --- /dev/null +++ b/data/GrabberConfig/lupa.cz.txt @@ -0,0 +1,3 @@ +strip_id_or_class: promo-in-article + +test_url: http://www.lupa.cz/clanky/michal-zamec-parfums-za-dva-roky-presahneme-obrat-6-miliard-korun/
\ No newline at end of file diff --git a/data/GrabberConfig/luxuo.com.txt b/data/GrabberConfig/luxuo.com.txt new file mode 100644 index 00000000..a3d5cb17 --- /dev/null +++ b/data/GrabberConfig/luxuo.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='post-content'] +prune: no + +test_url: http://www.luxuo.com/watches/feed
\ No newline at end of file diff --git a/data/GrabberConfig/lvsl.fr.txt b/data/GrabberConfig/lvsl.fr.txt new file mode 100644 index 00000000..90a532ea --- /dev/null +++ b/data/GrabberConfig/lvsl.fr.txt @@ -0,0 +1,3 @@ +title: //h1[contains(@class,'entry-title')] +body: //div[contains(@class,'body-content')] +test_url: http://lvsl.fr/peuple-manifestant-saez-a-t-pondu-hexagone diff --git a/data/GrabberConfig/lwn.net.txt b/data/GrabberConfig/lwn.net.txt new file mode 100644 index 00000000..a10cd923 --- /dev/null +++ b/data/GrabberConfig/lwn.net.txt @@ -0,0 +1,49 @@ +# HTML5 anyone? The 1980s called, they want their HTML4 back. +# LWN uses so little markup that you really have to be creative. + +tidy: yes +prune: no + +single_page_link: //div[@class='ArticleText']//a[contains(text(), 'Full Story')]/@href +single_page_link: concat(//div[@class='ArticleText']//a[contains(text(), 'Read more')]/@href, 'bigpage') +if_page_contains: //div[@class='ArticleText']//a[contains(text(), 'Read more')] + +title: //h1 + +# After tidying the document, <b> becomes <strong>. +author: //div[@class='FeatureByline']/strong +date: //div[@class='FeatureByline']/text()[preceding-sibling::br] +strip: //div[@class='FeatureByline'] +author: substring-after(//div[@class='GAByline']/p[2], 'by ') +date: //div[@class='GAByline']/p[1] +strip: //div[@class='GAByline'] + +# tidy will take care of fixing the tag mess that we make here. +replace_string(<p class="Cat1HL">): <h1> +replace_string(<h2 class="SummaryHL">): <h3> +replace_string(<p class="Cat2HL">): <h2> + +# Make extracting the content before "Log in to post comments" easier. +# And by "easier" I mean possible in all cases without going through +# a lot of XPath pain. 
+replace_string(<hr width="60%" align="left">): <div class="ftrss-strip"> +replace_string(to post comments)): </div> +strip: //div[@class='ftrss-strip'] +body: //div[@class='ArticleText'] +strip: //table[@class='Form'] + +requires_login: yes + +login_uri: https://lwn.net/login +login_username_field: Username +login_password_field: Password + +not_logged_in_xpath: /html/body/div[3]/div[1]/form[@class="loginform"] + +test_url: http://lwn.net/Articles/668318/ +test_url: http://lwn.net/Articles/668695/ +test_url: http://lwn.net/Articles/669114/ +test_url: http://lwn.net/Articles/670209/ +test_url: http://lwn.net/Articles/670209/rss +test_url: http://lwn.net/Articles/668318/rss +test_url: http://lwn.net/Articles/670062/ diff --git a/data/GrabberConfig/m.bbc.co.uk.txt b/data/GrabberConfig/m.bbc.co.uk.txt new file mode 100644 index 00000000..7037c64b --- /dev/null +++ b/data/GrabberConfig/m.bbc.co.uk.txt @@ -0,0 +1,14 @@ +title: //div[@class="story-body"]/div[@class="story-inner"]/h1 +body: //div[@class="story-body"] +date: //p[@class='date']/strong +author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By') + +find_string: http://ichef.bbci.co.uk/news/200/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +find_string: http://ichef.bbci.co.uk/news/304/ +replace_string: http://ichef.bbci.co.uk/news/624/ + +strip: //div[@class="story-inner"]/div[@class="byline"] + +test_url: http://m.bbc.co.uk/news/science-environment-19144464
\ No newline at end of file diff --git a/data/GrabberConfig/m.douban.com.txt b/data/GrabberConfig/m.douban.com.txt new file mode 100644 index 00000000..ce9a3167 --- /dev/null +++ b/data/GrabberConfig/m.douban.com.txt @@ -0,0 +1,13 @@ +# This filter is tested on: +# http://m.douban.com/note/240776310/?session=6ac86d1e +# http://m.douban.com/note/208270705/?session=e00ec732_3433229 + +title: //h2 +author: //a[@class='founder'] +date: substring-after(//span[@class='info'],' | ') +body: //div[contains(@class,'entry item')] + +strip://span[contains(@class,'info')] + +convert_double_br_tags: yes +test_url: http://m.douban.com/note/240776310/?session=6ac86d1e
\ No newline at end of file diff --git a/data/GrabberConfig/m.facebook.com.txt b/data/GrabberConfig/m.facebook.com.txt new file mode 100644 index 00000000..81d2b00e --- /dev/null +++ b/data/GrabberConfig/m.facebook.com.txt @@ -0,0 +1,12 @@ +body: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')] + +title: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')]//h3 + +strip_id_or_class: commentable +strip: //*[contains(@data-sigil, 'm-mentions-expand') or contains(@data-sigil, 'story-popup-context') or contains(@data-sigil, 'share') or contains(@data-sigil, 'translate')] + +prune: no +tidy: no + +test_url: https://m.facebook.com/permalink.php?story_fbid=10154584776550183&id=294468630182 +test_contains: holding an extraordinary session in Brussels this month diff --git a/data/GrabberConfig/m.theregister.co.uk.txt b/data/GrabberConfig/m.theregister.co.uk.txt new file mode 100644 index 00000000..64cb1c32 --- /dev/null +++ b/data/GrabberConfig/m.theregister.co.uk.txt @@ -0,0 +1,4 @@ +strip: //div[@class='wptl btm'] +body: //div[@id='article']//h2 | //div[@id='body'] + +test_url: http://m.theregister.co.uk/2015/07/06/geeks_guide_spaceguard_center/ diff --git a/data/GrabberConfig/m.vanityfair.com.txt b/data/GrabberConfig/m.vanityfair.com.txt new file mode 100644 index 00000000..e47ce2ce --- /dev/null +++ b/data/GrabberConfig/m.vanityfair.com.txt @@ -0,0 +1,11 @@ +# Article Metadata +title: //h1 +author: //span[@class="name"]/a +date: //time + +# Content Pruning +strip: //h5 +strip: //time +strip: //div[@class="byline"] +strip: //h2[@class="headline "] +test_url: http://m.vanityfair.com/politics/2012/10/michael-lewis-profile-barack-obama
\ No newline at end of file diff --git a/data/GrabberConfig/m.wikihow.com.txt b/data/GrabberConfig/m.wikihow.com.txt new file mode 100644 index 00000000..66d08a92 --- /dev/null +++ b/data/GrabberConfig/m.wikihow.com.txt @@ -0,0 +1,14 @@ +body: //div[@id='content_wrapper'] +prune: no +#tidy: no +strip_id_or_class: gatEditSection +strip_id_or_class: relatedwikihows +strip_id_or_class: edit-page +strip_id_or_class: step_num +strip_id_or_class: image_details +strip_id_or_class: mh-method-thumbs-template + +strip: //div[@id='article_rating_mobile'] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' ur_nav_container ')] + +test_url: http://m.wikihow.com/Check-out-a-Used-Car-Before-Buying-It diff --git a/data/GrabberConfig/m.xkcd.com.txt b/data/GrabberConfig/m.xkcd.com.txt new file mode 100644 index 00000000..98f3f651 --- /dev/null +++ b/data/GrabberConfig/m.xkcd.com.txt @@ -0,0 +1,7 @@ +body: //body +prune: no +strip_id_or_class: navButtons +strip_id_or_class: news +strip_id_or_class: footerLinks +strip_id_or_class: altTextLink +test_url: https://m.xkcd.com/2057/ diff --git a/data/GrabberConfig/m00natic.github.io.txt b/data/GrabberConfig/m00natic.github.io.txt new file mode 100644 index 00000000..911fcbd0 --- /dev/null +++ b/data/GrabberConfig/m00natic.github.io.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fm00natic.github.io%2Femacs%2Femacs-wiki.html + +body: //div[@id='content'] +strip_id_or_class: table-of-contents +test_url: https://m00natic.github.io/emacs/emacs-wiki.html
\ No newline at end of file diff --git a/data/GrabberConfig/mac4ever.com.txt b/data/GrabberConfig/mac4ever.com.txt new file mode 100644 index 00000000..87e3c3d8 --- /dev/null +++ b/data/GrabberConfig/mac4ever.com.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.mac4ever.com%2Factu%2F104137_kim-kardashian-et-ses-soeurs-etaient-a-l-apple-store-de-soho-pour-presenter-leur-nouvelle-app + +author: //meta[@name="author"]/@content +date: //meta[@itemprop="datePublished"]/@content + +body: //div[contains(concat(' ',normalize-space(@class),' '),' news-news-content ')] +test_url: http://www.mac4ever.com/actu/104137_kim-kardashian-et-ses-soeurs-etaient-a-l-apple-store-de-soho-pour-presenter-leur-nouvelle-app diff --git a/data/GrabberConfig/macdrifter.com.txt b/data/GrabberConfig/macdrifter.com.txt new file mode 100644 index 00000000..e57bd640 --- /dev/null +++ b/data/GrabberConfig/macdrifter.com.txt @@ -0,0 +1,2 @@ +title: substring-before(//title,' « Macdrifter') +test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/
\ No newline at end of file diff --git a/data/GrabberConfig/macg.co.txt b/data/GrabberConfig/macg.co.txt new file mode 100644 index 00000000..a4bfd74a --- /dev/null +++ b/data/GrabberConfig/macg.co.txt @@ -0,0 +1,11 @@ +date: //time[@itemprop="datePublished"]/@datetime + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' plusLoin ')] +strip: //select +strip_id_or_class: comments +strip_id_or_class: smart-paging-pager + +next_page_link: //div[contains(concat(' ',normalize-space(@class),' '),' smart-paging-pager ')]//a[@title='Aller à la page suivante'] + +body: //section[contains(concat(' ',normalize-space(@class),' '),' corps ')] +test_url: http://www.macg.co/aapl/2015/09/tim-cook-rassure-sur-lavenir-du-mac-90957 diff --git a/data/GrabberConfig/macgeneration.com.txt b/data/GrabberConfig/macgeneration.com.txt new file mode 100644 index 00000000..739eff4e --- /dev/null +++ b/data/GrabberConfig/macgeneration.com.txt @@ -0,0 +1,5 @@ +author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le') +date: substring-after(//div[@class='dateNews'],' le ') +body: //div[@class='singleNews zoneApple'] + +test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox
\ No newline at end of file diff --git a/data/GrabberConfig/macmagazine.com.br.txt b/data/GrabberConfig/macmagazine.com.br.txt new file mode 100644 index 00000000..da7df695 --- /dev/null +++ b/data/GrabberConfig/macmagazine.com.br.txt @@ -0,0 +1,21 @@ +# Remove sliders +strip: //*[(@class="slides_container")] +strip: //div[(@id="slides_two")] + +# Remove tag cloud +strip: //span[(@class="secao")] + +# Fix date article +# TODO + +# Remove other stuff +strip: //div[(@id="idc-container")] +strip: //div[(@id="idc-noscript")] +strip: //div[(@class="linkwithin_div")] +strip: //div[(@class="navPosts")] +strip: //div[(@id="lateral")] +strip: //div[(@id="autor")] +strip: //div[(@id="rodape")] +strip: //div[(@id="post")]/h1 +strip: //div[(@id="post")]/div[(@id="boxInformacoes")] +test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/
\ No newline at end of file diff --git a/data/GrabberConfig/macrumors.com.txt b/data/GrabberConfig/macrumors.com.txt new file mode 100644 index 00000000..83cfb4a6 --- /dev/null +++ b/data/GrabberConfig/macrumors.com.txt @@ -0,0 +1,12 @@ +author: substring-after(//div[@class='byline'], " by ") +date: substring-before(//div[@class='byline'], " by ") + +# set body +body: //div[@class='content'] +strip_id_or_class: commentsContainer +strip_id_or_class: linkback + +# set title +title: //h3 +#strip: //div[@class='content']/h3 +test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/ diff --git a/data/GrabberConfig/macstories.net.txt b/data/GrabberConfig/macstories.net.txt new file mode 100644 index 00000000..639fdd19 --- /dev/null +++ b/data/GrabberConfig/macstories.net.txt @@ -0,0 +1,8 @@ +strip: //*[(@id = "featured")] + +author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') + +date: concat(//div[@class='month'],' ',//div[@class='day']) + +#macstories doesn't provide a year, but month/day is better than nothing +test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/
\ No newline at end of file diff --git a/data/GrabberConfig/mactalk.com.au.txt b/data/GrabberConfig/mactalk.com.au.txt new file mode 100644 index 00000000..a0cb4eff --- /dev/null +++ b/data/GrabberConfig/mactalk.com.au.txt @@ -0,0 +1,4 @@ +author://div[@class="article_username_container_full"] +date://div[@class="article_username_container"] +body://div[@class="article cms_clear restore postcontainer"] +test_url: http://www.mactalk.com.au/content.php?r=3759-The-iPad-that-still-won-t-die-iOS-9-on-the-iPad-2 diff --git a/data/GrabberConfig/mactechnews.de.txt b/data/GrabberConfig/mactechnews.de.txt new file mode 100644 index 00000000..5c03518a --- /dev/null +++ b/data/GrabberConfig/mactechnews.de.txt @@ -0,0 +1,3 @@ +title: substring-after(substring-after(//title, '>'), '>') +body: //div[@class='NewsArticleContent'] +test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html
\ No newline at end of file diff --git a/data/GrabberConfig/macworld.com.txt b/data/GrabberConfig/macworld.com.txt new file mode 100644 index 00000000..e7d97202 --- /dev/null +++ b/data/GrabberConfig/macworld.com.txt @@ -0,0 +1,24 @@ +title: //article//h1 +date: //meta[@name="date"]/@content +author: //div[@class="author-name" or @class="article-byline"]/a[1] + +body: //section[@class="page"] + +# remove 'From the Lab' and 'Recent posts' text +strip: //div[@class='blogLabel'] + +# remove byline and meta info +strip: //div[@class="article-meta"] +strip: //div[@class="author-info"] + +#strip tags and categories +strip: //div[@class="department"] + +#strip product cap links +strip: //div[@class="cap-main"] +strip: //div[@id="compare-lede"] + +prune: no + +# copes less well with Review pages, seems fine for News +test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html
\ No newline at end of file diff --git a/data/GrabberConfig/mainpost.de.txt b/data/GrabberConfig/mainpost.de.txt new file mode 100644 index 00000000..2f6382f1 --- /dev/null +++ b/data/GrabberConfig/mainpost.de.txt @@ -0,0 +1,28 @@ +title: substring-before(//title, '|') +body: //*[@id='content-left'] + +# Why is this not working here? +# body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail'] + + +#Header +strip_id_or_class: 'subHead' +strip_id_or_class: 'fl_right' +strip_id_or_class: 'infolink' +strip_id_or_class: 'content-head' +strip_id_or_class: 'tab' +strip_id_or_class: 'tab-active' +strip: //*[contains(@class,'trenner')] + +# Headline +strip: //h1/* +strip_id_or_class: 'font16' + +#Images +strip_id_or_class: 'leftimage' +strip_id_or_class: 'rightimage' + +#Comments +strip: //table +strip: //p/following-sibling::*[0] +test_url: http://www.mainpost.de/regional/wuerzburg/Autobahnschuetze-Staatsanwalt-fordert-zwoelf-Jahre;art492151,8386332 diff --git a/data/GrabberConfig/maitre-eolas.fr.txt b/data/GrabberConfig/maitre-eolas.fr.txt new file mode 100644 index 00000000..9b428ab7 --- /dev/null +++ b/data/GrabberConfig/maitre-eolas.fr.txt @@ -0,0 +1,5 @@ +body: //div[@class="post-content"] +author: //meta[@name="author"]/@content +date: //meta[@name="date"]/@content + +test_url: http://www.maitre-eolas.fr/post/2018/05/13/Pour-en-finir-avec-les-fiches-S diff --git a/data/GrabberConfig/manager.co.th.txt b/data/GrabberConfig/manager.co.th.txt new file mode 100644 index 00000000..cd6c5c01 --- /dev/null +++ b/data/GrabberConfig/manager.co.th.txt @@ -0,0 +1,26 @@ +title: //td[@class="headline"] +author: //font[@color="#003366"] +date: //td[@class="date"] + +strip: //td[@class="headline"] +strip: //font[@color="#003366"] +strip: //td[@class="date"] + +strip: //img[@src="images/2009/logo_en.gif"] + +body: //tbody[@class="body"] +convert_double_br_tags:yes + +strip: 
//img[@src="/images/TabOver.gif"] +strip: //td[@width="160"] +strip: //img[@src="/images/TabUnder.gif"] + +strip: //td[@class="small"] +strip: //td[@height="47"] + +strip: //td[@valign="middle"] +strip: //td[@background="/images/menu_bottombg.gif"] +strip: //img[@src="/images/sc_footer_l.gif"] +strip: //img[@src="/images/sc_footer_m.gif"] +strip: //img[@src="/images/sc_footer_r.gif"] +test_url: http://www.manager.co.th/Entertainment/ViewNews.aspx?NewsID=9550000101979
\ No newline at end of file diff --git a/data/GrabberConfig/manga-news.com.txt b/data/GrabberConfig/manga-news.com.txt new file mode 100644 index 00000000..2fc54643 --- /dev/null +++ b/data/GrabberConfig/manga-news.com.txt @@ -0,0 +1,7 @@ +title: //h2[@class='actu-title'] + +body: //div[@class='actu-content'] + +prune: no + +test_url: http://www.manga-news.com/index.php/actus/2016/02/02/Deux-retours-dans-le-planning-des-editions-Pika diff --git a/data/GrabberConfig/marco.org.txt b/data/GrabberConfig/marco.org.txt new file mode 100644 index 00000000..4bb24a62 --- /dev/null +++ b/data/GrabberConfig/marco.org.txt @@ -0,0 +1,8 @@ +tidy: no +prune: no +date: //article//time[@pubdate] +title: //article/header/h2 +body: //article +strip: //header +test_url: http://www.marco.org/2012/09/08/businessweek-gruber +test_url: http://www.marco.org/2012/04/24/might-upgrade-someday
\ No newline at end of file diff --git a/data/GrabberConfig/marksdailyapple.com.txt b/data/GrabberConfig/marksdailyapple.com.txt new file mode 100644 index 00000000..0077f560 --- /dev/null +++ b/data/GrabberConfig/marksdailyapple.com.txt @@ -0,0 +1,2 @@ +strip_id_or_class: wwsgd +test_url: http://www.marksdailyapple.com/are-detoxes-and-cleanses-safe-and-effective/
\ No newline at end of file diff --git a/data/GrabberConfig/marmiton.org.txt b/data/GrabberConfig/marmiton.org.txt new file mode 100644 index 00000000..107427bd --- /dev/null +++ b/data/GrabberConfig/marmiton.org.txt @@ -0,0 +1,6 @@ +title: //h1[@class="main-title"] +author: //span[@class="recipe-author__name"] +body: //div[@id="sticky-desktop-only"] +strip: //div[@id="bloc-video"] + +test_url: http://www.marmiton.org/recettes/recette_gateau-au-chocolat-fondant-rapide_166352.aspx diff --git a/data/GrabberConfig/marriedtothesea.com.txt b/data/GrabberConfig/marriedtothesea.com.txt new file mode 100644 index 00000000..61906555 --- /dev/null +++ b/data/GrabberConfig/marriedtothesea.com.txt @@ -0,0 +1,4 @@ +body: //img[contains(@src, '.gif')] + +test_url: http://www.marriedtothesea.com +test_url: http://www.marriedtothesea.com/index.php?date=010818 diff --git a/data/GrabberConfig/marsactu.fr.txt b/data/GrabberConfig/marsactu.fr.txt new file mode 100644 index 00000000..f2e226c2 --- /dev/null +++ b/data/GrabberConfig/marsactu.fr.txt @@ -0,0 +1,35 @@ + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' entry-title ')] + +body: //div[contains(concat(' ',normalize-space(@class),' '),' content ')] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-header ')]//small//div[contains(concat(' ',normalize-space(@class),' '),' bestof-meta-autor ')] + +date: //meta[@property="og:updated_time"]/@content + +strip_id_or_class: wc-memberships-content-restricted-message +strip_id_or_class: box_protegee_abonne +strip_id_or_class: share-bottom +strip_id_or_class: share-comment +strip_id_or_class: wpb_wrapper +strip: //ul[@id='actions'] + +test_url: https://marsactu.fr/a-miramas-village-vitrine-en-carton-pate-et-vrai-centre-ville-en-coulisses/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes 
+not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' box_protegee_abonne ')] +login_uri: https://marsactu.fr/wp-login.php +login_username_field: log +login_password_field: pwd +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Se+connecter +login_extra_fields: testcookie=1 +login_extra_fields: redirect_to=@=xpath('//input[@name="redirect_to"]', request_html('https://marsactu.fr/wp-login.php')) +# Remark: in the previous line, the @=xpath(...) is needed not because +# the value of "redirect_to" is dynamic, but because we need a call +# to request_html('https://marsactu.fr/wp-login.php') in order to +# initialize the session cookie mandatory for the login to succeed. + diff --git a/data/GrabberConfig/martinfowler.com.txt b/data/GrabberConfig/martinfowler.com.txt new file mode 100644 index 00000000..4ff4a9c2 --- /dev/null +++ b/data/GrabberConfig/martinfowler.com.txt @@ -0,0 +1,8 @@ +date: //div[@id="main"]/p[@class="date"] +author: string("Martin Fowler") +body: //div[@id="main"] +strip_id_or_class: date +strip_id_or_class: tags +strip_id_or_class: tagLabel +strip: //div[@id="main"]/h1[1] +test_url: http://martinfowler.com/bliki/DatabaseThaw.html
\ No newline at end of file diff --git a/data/GrabberConfig/mashable.com.txt b/data/GrabberConfig/mashable.com.txt new file mode 100644 index 00000000..b6efb6c5 --- /dev/null +++ b/data/GrabberConfig/mashable.com.txt @@ -0,0 +1,11 @@ +title: //h1[@class='title'] +author: substring-after(//span[@class='author_name'], 'By ') +date: //time + +body: //article +strip: //div[@class='ytm-gallery-box'] +strip: //div[contains(@class, 'adsense')] +strip: //aside[contains(@class, 'social')] +strip_id_or_class: article-topics + +test_url: http://mashable.com/2013/05/24/myspace-architects-rebuilding-a-brand/ diff --git a/data/GrabberConfig/matt.might.net.txt b/data/GrabberConfig/matt.might.net.txt new file mode 100644 index 00000000..30d585cf --- /dev/null +++ b/data/GrabberConfig/matt.might.net.txt @@ -0,0 +1,5 @@ +title: //h1 +author: string("Matt Might") +strip: //h1/following-sibling::div + +test_url: http://matt.might.net/articles/oo-cesk/
\ No newline at end of file diff --git a/data/GrabberConfig/mattcutts.com.txt b/data/GrabberConfig/mattcutts.com.txt new file mode 100644 index 00000000..76b1eac6 --- /dev/null +++ b/data/GrabberConfig/mattcutts.com.txt @@ -0,0 +1,2 @@ +date: //*[@class = 'published'] +test_url: http://www.mattcutts.com/blog/internet-censorship-sopa/
\ No newline at end of file diff --git a/data/GrabberConfig/maxim.com.txt b/data/GrabberConfig/maxim.com.txt new file mode 100644 index 00000000..344866cf --- /dev/null +++ b/data/GrabberConfig/maxim.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(@class, 'field-name-body') or contains(@class, 'featured-image')] + +test_url: http://www.maxim.com/rss-feeds.xml +test_url: http://www.maxim.com/entertainment/article/second-city-chicago-goes-flames
\ No newline at end of file diff --git a/data/GrabberConfig/mbari.org.txt b/data/GrabberConfig/mbari.org.txt new file mode 100644 index 00000000..b84e6e00 --- /dev/null +++ b/data/GrabberConfig/mbari.org.txt @@ -0,0 +1,9 @@ +title: //h1 +body: //section[@class='av_textblock_section'] + +# Delete date stored as an h3 (first of the page) +strip: (//section[@class='av_textblock_section']//h3)[1] + +# Everything else is well done with opengraph data + +test_url: https://www.mbari.org/ctenophore-evolution/ diff --git a/data/GrabberConfig/mbl.is.txt b/data/GrabberConfig/mbl.is.txt new file mode 100644 index 00000000..fd26f091 --- /dev/null +++ b/data/GrabberConfig/mbl.is.txt @@ -0,0 +1,2 @@ +body: //div[@class="frett-main"] +test_url: http://mbl.is/frettir/innlent/2012/02/21/litill_munur_a_fargjaldaverdi/
\ No newline at end of file diff --git a/data/GrabberConfig/mdr.de.txt b/data/GrabberConfig/mdr.de.txt new file mode 100644 index 00000000..5aba24a5 --- /dev/null +++ b/data/GrabberConfig/mdr.de.txt @@ -0,0 +1,14 @@ +body: //div[contains(@class, 'section') and contains(@class, 'sectionZ')] +date: //p[@class='timestamp'] + +strip_id_or_class: hidden +strip_id_or_class: conComments +strip_id_or_class: firstMediaFull + +# Will remove all image and video embeds for a cleaner reading experience. +strip_id_or_class: cssBoxTeaserStandard +strip_id_or_class: cssBoxTeaserLink + +test_url: https://www.mdr.de/sachsen-anhalt/dessau/anhalt/todesursache-koethen-herzinfarkt-100.html +test_url: https://www.mdr.de/sachsen-anhalt/magdeburg/magdeburg/ein-job-fuer-jonas-100.html +test_url: https://www.mdr.de/sachsen-anhalt/dessau/anhalt/koethen-demonstration-am-sonntag-100.html diff --git a/data/GrabberConfig/mediacites.fr.txt b/data/GrabberConfig/mediacites.fr.txt new file mode 100644 index 00000000..3e7a0f7e --- /dev/null +++ b/data/GrabberConfig/mediacites.fr.txt @@ -0,0 +1,25 @@ +title: //meta[@name='og:title']/@content + +author: //meta[@name='author']/@content + +body: //div[contains(concat(' ',normalize-space(@class),' '),' td-post-header ')]//*[contains(concat(' ',normalize-space(@class),' '),' td-post-sub-title ')] | //div[contains(concat(' ',normalize-space(@class),' '),' td-post-content ')] + +strip_id_or_class: send-gift-article-email +strip_id_or_class: point-final +strip_id_or_class: at-above-post +strip_id_or_class: at-below-post + +test_url: https://www.mediacites.fr/actu/2017/12/05/lhistoire-de-mediacites-a-laube-de-son-1er-anniversaire/ + +# Wallabag-specific login directives (not supported in FTR): +requires_login: yes +login_uri: https://www.mediacites.fr/mon-compte/ +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' restrictblock ')] +login_username_field: rcp_user_login +login_password_field: rcp_user_pass +login_extra_fields: 
rcp_login_nonce=@=xpath('//form[@id="rcp_login_form"]//input[@name="rcp_login_nonce"]', request_html('https://www.mediacites.fr/mon-compte/')) +login_extra_fields: rcp_user_remember=1 +login_extra_fields: rcp_action=login +login_extra_fields: rcp_redirect=https://www.mediacites.fr/mon-compte/ +login_extra_fields: rcp_login_submit=Connexion +test_url: https://www.mediacites.fr/toulouse/enquete-toulouse/2018/07/17/les-eaux-troubles-du-canal-du-midi/ diff --git a/data/GrabberConfig/medialens.org.txt b/data/GrabberConfig/medialens.org.txt new file mode 100644 index 00000000..c26bac55 --- /dev/null +++ b/data/GrabberConfig/medialens.org.txt @@ -0,0 +1,5 @@ +strip_id_or_class: article-tools +strip_id_or_class: pagenav +prune: no +test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html +test_contains: In an era of permanent war, economic meltdown diff --git a/data/GrabberConfig/mediapart.fr.txt b/data/GrabberConfig/mediapart.fr.txt new file mode 100644 index 00000000..85db996a --- /dev/null +++ b/data/GrabberConfig/mediapart.fr.txt @@ -0,0 +1,19 @@ +title://h1[@class="title"] +body://section[@class="global-wrapper"]//div[@class="page-pane"] + +date: //div[contains(concat(' ',normalize-space(@class),' '),' author ')]//time/@datetime +author: //div[contains(concat(' ',normalize-space(@class),' '),' author ')]//a[@class='journalist'] + +single_page_link: //link[@rel="canonical"] + +strip: //h2[@class='h4'] + +requires_login: yes + +login_uri: https://www.mediapart.fr/login_check +login_username_field: name +login_password_field: password + +not_logged_in_xpath: //body[contains(@class,"not-logged-in")] + +test_url: https://www.mediapart.fr/journal/france/170116/le-site-slatefr-est-passe-entre-les-mains-du-cac-40 diff --git a/data/GrabberConfig/medium.com.txt b/data/GrabberConfig/medium.com.txt new file mode 100644 index 00000000..2c969c0a --- /dev/null +++ b/data/GrabberConfig/medium.com.txt @@ -0,0 +1,17 @@ +body: 
//main[contains(@class, 'postArticle-content')] +body: //div[contains(@class, 'postArticle-content')] +body: //div[contains(@class, 'section-inner')] +strip_id_or_class: supplementalPostContent + +parser: html5php + +tidy: no +prune: no + +test_url: https://medium.com/@savolai/kaytettavyyden-haasteet-keskustelukulttuurista-2-3-6844c0d7893b +test_contains: Jos käytettävyysongelmat ovat kerran niin tyypillisiä +test_contains: Keskustelukulttuuriongelmasta (subjective vs. objective bugs) + +test_url: https://medium.com/health-the-future/thirty-things-ive-learned-482765ee3503 +test_contains: Remember you will die +test_contains: You have to have some faith. diff --git a/data/GrabberConfig/megamp3.eu.txt b/data/GrabberConfig/megamp3.eu.txt new file mode 100644 index 00000000..1b6a1279 --- /dev/null +++ b/data/GrabberConfig/megamp3.eu.txt @@ -0,0 +1,8 @@ +title: //h3[@class='episode_title'] +body: //ul[contains(@class, 'episode_imgdesc')]/li/descendant::* +prune: no +strip://*[contains(@class, 'plugin')] +strip://*[contains(@class, 'episode_keywords')] + +test_url: http://www.megamp3.eu/?p=episode&name=2013-04-19_la_filiere_progressive_431.mp3 +test_url: http://www.megamp3.eu/feed.xml diff --git a/data/GrabberConfig/mein-mmo.de.txt b/data/GrabberConfig/mein-mmo.de.txt new file mode 100644 index 00000000..3675e4ac --- /dev/null +++ b/data/GrabberConfig/mein-mmo.de.txt @@ -0,0 +1,4 @@ +body: //div[@class='gp-entry-text'] +strip: //aside + +test_url: http://mein-mmo.de/pokemon-go-15-staerksten-pokemon-der-2-generation/ diff --git a/data/GrabberConfig/menshealth.com.sg.txt b/data/GrabberConfig/menshealth.com.sg.txt new file mode 100644 index 00000000..af450b5e --- /dev/null +++ b/data/GrabberConfig/menshealth.com.sg.txt @@ -0,0 +1,7 @@ +strip: //div[contains(@style, 'float:right') and contains(., 'advertisement')] +body: //div[@style="float:left;width:740px;"] + +tidy: no + +# broken feed? 
+test_url: http://www.menshealth.com.sg/fitness/feed diff --git a/data/GrabberConfig/meowni.ca.txt b/data/GrabberConfig/meowni.ca.txt new file mode 100644 index 00000000..af5b9642 --- /dev/null +++ b/data/GrabberConfig/meowni.ca.txt @@ -0,0 +1,3 @@ +author: //meta[@name="author"]/@content + +test_url: https://meowni.ca/posts/2017-puppeteer-tests/ diff --git a/data/GrabberConfig/mercurynews.com.txt b/data/GrabberConfig/mercurynews.com.txt new file mode 100644 index 00000000..0ff85632 --- /dev/null +++ b/data/GrabberConfig/mercurynews.com.txt @@ -0,0 +1,10 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-content ')] + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' slideshow-ad-video ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' video-ad ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' meta ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' tags ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' twitter-follow ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' button-center-wrapper ')] +strip: //img[contains(concat(' ',normalize-space(@class),' '),' avatar ')] +strip: //h3[contains(concat(' ',normalize-space(@class),' '),' vcard ')] diff --git a/data/GrabberConfig/mesec.cz.txt b/data/GrabberConfig/mesec.cz.txt new file mode 100644 index 00000000..d947f909 --- /dev/null +++ b/data/GrabberConfig/mesec.cz.txt @@ -0,0 +1,4 @@ +body: //div[@class='urs'] | //div[@itemprop='articleBody'] +strip_id_or_class: promo-in-article + +test_url: http://www.mesec.cz/aktuality/ceske-drahy-pridaji-zamestnancum-jejich-mzdy-vzrostou-o-1-7/ diff --git a/data/GrabberConfig/metafilter.com.txt b/data/GrabberConfig/metafilter.com.txt new file mode 100644 index 00000000..a2f3ada9 --- /dev/null +++ b/data/GrabberConfig/metafilter.com.txt @@ -0,0 +1,8 @@ +body: //div[contains(@class, 'copy') or contains(@class, 'comments')] +strip_id_or_class: related 
+strip: //a[. = 'Subscribe'] +strip: //h1/span[@class = 'smallcopy'] +strip: //a[@class = 'skip'] +strip: //div[@id = 'logo'] +strip: //div[contains(@class, 'comments') and contains(., 'You are not currently logged in')] +test_url: http://www.metafilter.com/128101/Probably-more-secure-than-the-Drafts-folder-on-a-shared-Gmail-account
\ No newline at end of file diff --git a/data/GrabberConfig/metrocop.net.txt b/data/GrabberConfig/metrocop.net.txt new file mode 100644 index 00000000..9535f125 --- /dev/null +++ b/data/GrabberConfig/metrocop.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fmetrocop.net%2Fmythic-comics%2Fissue-1%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')] +test_url: http://metrocop.net/mythic-comics/issue-1/ diff --git a/data/GrabberConfig/mforum.cari.com.my.txt b/data/GrabberConfig/mforum.cari.com.my.txt new file mode 100644 index 00000000..c295d734 --- /dev/null +++ b/data/GrabberConfig/mforum.cari.com.my.txt @@ -0,0 +1,6 @@ +body: (//td[starts-with(@id, 'postmessage_')])[1] + +prune: no + +test_url: http://mforum.cari.com.my/forum.php?mod=viewthread&tid=788033 +test_url: http://mforum.cari.com.my/forum.php?mod=rss&fid=265&auth=0
\ No newline at end of file diff --git a/data/GrabberConfig/mikeash.com.txt b/data/GrabberConfig/mikeash.com.txt new file mode 100644 index 00000000..abaa6a81 --- /dev/null +++ b/data/GrabberConfig/mikeash.com.txt @@ -0,0 +1,5 @@ +title: //div[@class="blogtitle"] +strip: //div[@class="blogtitle"] + +author: substring-after(//span[@class="blogheader"], 'Author: ') +test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html
\ No newline at end of file diff --git a/data/GrabberConfig/mikeindustries.com.txt b/data/GrabberConfig/mikeindustries.com.txt new file mode 100644 index 00000000..fb4636cc --- /dev/null +++ b/data/GrabberConfig/mikeindustries.com.txt @@ -0,0 +1,9 @@ +title: //div[@class='post_content']/h2 +date: //div[@class='dateline'] +body: //div[@class='entry'] + +strip: //div[@class='closer'] +strip: //div[@class='navigation'] +strip: //div[@class='aux_pane'] +strip: //div[@class='aux_aux_pane'] +test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another
\ No newline at end of file diff --git a/data/GrabberConfig/minnesota.publicradio.org.txt b/data/GrabberConfig/minnesota.publicradio.org.txt new file mode 100644 index 00000000..773a627c --- /dev/null +++ b/data/GrabberConfig/minnesota.publicradio.org.txt @@ -0,0 +1,10 @@ +title: //*[@class="article"]/h1 +date: //*[@class="article"]/div[@class="date"] + +# strip the title and date from the article text +strip: //*[@class="article"]/h1 +strip: //*[@class="article"]/div[@class="date"] + +# strip annoying <br> between metadata and article +strip: //*[@class="article"]/div[@class="date"]/following-sibling::br +test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/
\ No newline at end of file diff --git a/data/GrabberConfig/minnpost.com.txt b/data/GrabberConfig/minnpost.com.txt new file mode 100644 index 00000000..dc926a6f --- /dev/null +++ b/data/GrabberConfig/minnpost.com.txt @@ -0,0 +1,5 @@ +title: //*[@id="content-header"]/h1 +author: //*[contains(@class, 'byline')]/a/text() +date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|') +body: //*[contains(@class, 'node-body')] +test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court
\ No newline at end of file diff --git a/data/GrabberConfig/mirrorfootball.co.uk.txt b/data/GrabberConfig/mirrorfootball.co.uk.txt new file mode 100644 index 00000000..2033cf33 --- /dev/null +++ b/data/GrabberConfig/mirrorfootball.co.uk.txt @@ -0,0 +1,3 @@ +# Remove extra links +strip: //*[@class='appended_html'] +test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html
\ No newline at end of file diff --git a/data/GrabberConfig/mises.org.txt b/data/GrabberConfig/mises.org.txt new file mode 100644 index 00000000..73c485e6 --- /dev/null +++ b/data/GrabberConfig/mises.org.txt @@ -0,0 +1,5 @@ +strip_id_or_class: 'book-ad' +strip_id_or_class: 'bigger pullquote' +strip_id_or_class: 'subscribe' +strip_id_or_class: 'blog-link' +test_url: http://mises.org/daily/4804
\ No newline at end of file diff --git a/data/GrabberConfig/mithatkonar.com.txt b/data/GrabberConfig/mithatkonar.com.txt new file mode 100644 index 00000000..fcf190f0 --- /dev/null +++ b/data/GrabberConfig/mithatkonar.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fmithatkonar.com%2Fwiki%2Fdoku.php%2Fkicad%2Fkicad_spice_quick_guide + +body: //div[contains(concat(' ',normalize-space(@class),' '),' page ')] +test_url: http://mithatkonar.com/wiki/doku.php/kicad/kicad_spice_quick_guide diff --git a/data/GrabberConfig/mlb.sbnation.com.txt b/data/GrabberConfig/mlb.sbnation.com.txt new file mode 100644 index 00000000..8480e302 --- /dev/null +++ b/data/GrabberConfig/mlb.sbnation.com.txt @@ -0,0 +1,14 @@ +title: //h1[@id = 'stream_title'] +author: //p[@class = 'byline']/a +date: //span[@class = 'datetime'] + +body: //div[@id = 'stream_container'] +strip: //p[@class = 'byline'] +strip_id_or_class: stream_summary +strip_id_or_class: social-spoken +strip_id_or_class: datetime +strip_id_or_class: author-mini-profile +strip_id_or_class: social-tools +strip_id_or_class: entry-tags +strip_id_or_class: fb-like-box +test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage
\ No newline at end of file diff --git a/data/GrabberConfig/mlssoccer.com.txt b/data/GrabberConfig/mlssoccer.com.txt new file mode 100644 index 00000000..5d706f88 --- /dev/null +++ b/data/GrabberConfig/mlssoccer.com.txt @@ -0,0 +1,6 @@ +title: //*[@class="header_title"]/h1 +date: //*[@class="field-date"] +author: //*[@class="field-author"] +body: //div[contains(@class, 'content')] + +test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation
\ No newline at end of file diff --git a/data/GrabberConfig/mmo-champion.com.txt b/data/GrabberConfig/mmo-champion.com.txt new file mode 100644 index 00000000..50d8a24f --- /dev/null +++ b/data/GrabberConfig/mmo-champion.com.txt @@ -0,0 +1,5 @@ +title: //h1 +body: //div[@id = 'article_content']/div[contains(@class,'article')] +author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')] +date: //div[@class = 'article_username_container'] +test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot
\ No newline at end of file diff --git a/data/GrabberConfig/mnn.com.txt b/data/GrabberConfig/mnn.com.txt new file mode 100644 index 00000000..d3576df2 --- /dev/null +++ b/data/GrabberConfig/mnn.com.txt @@ -0,0 +1,11 @@ +tidy: no +author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text() +date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2] +body: //div[@class="node"] + +strip_id_or_class: vertical-social-bar +strip_id_or_class: blogs_paginator +strip_id_or_class: horizontal-social-links +strip_id_or_class: servicelinksdiv + +test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal
\ No newline at end of file diff --git a/data/GrabberConfig/mno.hu.txt b/data/GrabberConfig/mno.hu.txt new file mode 100644 index 00000000..8a3f9391 --- /dev/null +++ b/data/GrabberConfig/mno.hu.txt @@ -0,0 +1,14 @@ +title: //title + +author: //div[@class="author"] + +strip_id_or_class: 'header' +strip_id_or_class: 'cikk_ajanlo' +strip_id_or_class: 'buttons' +strip_id_or_class: 'related' +strip_id_or_class: 'adbox ad_cikk_kozepre' +strip_id_or_class: 'cikk-cimkek' +strip_id_or_class: 'cikk_ertekeles' + +strip_comments: yes +test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351
\ No newline at end of file diff --git a/data/GrabberConfig/mobile.lemondeinformatique.fr.txt b/data/GrabberConfig/mobile.lemondeinformatique.fr.txt new file mode 100644 index 00000000..24aec5c3 --- /dev/null +++ b/data/GrabberConfig/mobile.lemondeinformatique.fr.txt @@ -0,0 +1,6 @@ +title: //h2 +body: //div[@id='illustration'] | //p +prune: no +tidy: no + +test_url: http://mobile.lemondeinformatique.fr/actualites/lire-les-datacenters-d-apple-google-et-facebook-eco-responsables-selon-greenpeace-le-monde-informatique-57122.html diff --git a/data/GrabberConfig/mobile.nytimes.com.txt b/data/GrabberConfig/mobile.nytimes.com.txt new file mode 100644 index 00000000..c815f0a3 --- /dev/null +++ b/data/GrabberConfig/mobile.nytimes.com.txt @@ -0,0 +1,71 @@ +# mobile.nytimes.com appears to be the same as www.nytimes.com now, +# so any changes here should probably also be made to nytimes.com.txt too + +title://h1[@class="articleHeadline"] +body: //div[contains(concat(' ',normalize-space(@class),' '),' story-body ')] +body://div[@id="article"] +body://*[@itemprop="articleBody"] +body: //div[contains(concat(' ',normalize-space(@class),' '),' g-body-article-container ')] +body: //article[@id='story'] +strip_id_or_class:articleTools +strip_id_or_class:readerscomment +#strip://div[contains(@class, "articleInline runaroundLeft")] +strip: //div[contains(@class, "doubleRule")] +# strip image credit - appears as a bold heading +strip: //div[contains(@class, "articleInline")]//h6 +strip_id_or_class:enlargeThis +strip_id_or_class:pageLinks +strip_id_or_class:memberTools +strip_id_or_class:articleExtras +strip_id_or_class:singleAd +strip_id_or_class:byline +strip_id_or_class:dateline +strip_id_or_class:articleheadline +strip_id_or_class:articleBottomExtra +strip_id_or_class:shareTools +strip_id_or_class:story-meta +strip_id_or_class:related-coverage +strip_id_or_class:ad-header +strip_id_or_class:bottom-ad +strip_id_or_class:advert_item +strip://a[contains(@href, 'nytimes.com/adx/')] 
+strip: //nyt_byline +strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')] +strip: //p[@class='caption']//a[contains(., 'More Photos')] +strip_id_or_class: ResponsiveAd + +strip_id_or_class: robots-nocontent +strip_id_or_class: hidden + +prune: no +tidy: no + +date: //meta[@property="article:published"]/@content +date: //meta[@itemprop="datePublished"]/@content + +find_string: src='https://static01.nyt.com/packages/flash/multimedia/ICONS/transparent.png +replace_string: ignore-src='https://static01.nyt.com/packages/flash/multimedia/ICONS/transparent.png +find_string: data-mediaviewer-src='https://static01.nyt.com +replace_string: src='https://static01.nyt.com + +single_page_link: //link[contains(@href, 'pagewanted=all')] +#mobile.nytimes.com looks same as regular www.nytimes.com now +#single_page_link: //link[@rel='alternate' and contains(@href, 'mobile.nytimes.com')]/@href +#single_page_link: concat(substring-before(//div[@id='pageLinks']//a[contains(@href, 'pagewanted=')]/@href, 'pagewanted='), 'pagewanted=all') + +strip://h6[@class = 'kicker'] + +test_url: http://mobile.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html +test_contains: In this column I want to look at a not uncommon way of writing + +test_url: http://mobile.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html +test_contains: IF you’ve seen enough of Aaron Sorkin’s theater + +test_url: https://mobile.nytimes.com/interactive/2016/books/review/best-books.html +test_contains: invention and speculation flow together + +test_url: http://mobile.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html +test_url: http://mobile.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html +test_url: http://mobile.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html 
+test_url: http://mobile.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html +test_url: https://mobile.nytimes.com/interactive/2015/12/16/upshot/100000004092329.app.html?_r=2 diff --git a/data/GrabberConfig/mobile.twitter.com.txt b/data/GrabberConfig/mobile.twitter.com.txt new file mode 100644 index 00000000..a4ceeeb9 --- /dev/null +++ b/data/GrabberConfig/mobile.twitter.com.txt @@ -0,0 +1,13 @@ +# mobile site (automatic redirect - noscript meta refresh) +title: concat("Tweet from ", (//*[contains(@class, 'UserNames-displayName') or contains(@class, 'fullname')])[1]) +author: (//*[contains(@class, 'UserNames-displayName') or contains(@class, 'fullname')])[1] +body: (//div[contains(@class, 'TweetDetail-text') or contains(@class, 'tweet-text')])[1] +date: (//div[contains(@class, 'TweetDetail-timeAndGeo') or contains(@class, 'metadata')])[1] + +parser: html5php + +prune: no +tidy: yes + +test_url: https://mobile.twitter.com/medialens/status/216883678582804480 +test_contains: is all but alone in challenging the tsunami of UK diff --git a/data/GrabberConfig/mobilenet.cz.txt b/data/GrabberConfig/mobilenet.cz.txt new file mode 100644 index 00000000..b7bb1355 --- /dev/null +++ b/data/GrabberConfig/mobilenet.cz.txt @@ -0,0 +1,3 @@ +next_page_link: //a[@class='navChapters__next'] + +test_url: http://mobilenet.cz/clanky/recenze-apple-ipad-pro-tesne-pod-vrcholem-28955 diff --git a/data/GrabberConfig/mobileopportunity.blogspot.com.txt b/data/GrabberConfig/mobileopportunity.blogspot.com.txt new file mode 100644 index 00000000..82da4aec --- /dev/null +++ b/data/GrabberConfig/mobileopportunity.blogspot.com.txt @@ -0,0 +1,11 @@ +body: //div[@class='post uncustomized-post-template'] + +# remove duplicate of post title, which is a link +strip: //h3[@class='post-title'] + +# remove permalink and timestamp, which isn't useful as it's a time with no date +strip: //span[@class='post-timestamp'] + +# remove labels (tags) +strip: 
//span[@class='post-labels'] +test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html
\ No newline at end of file diff --git a/data/GrabberConfig/mobilmania.cz.txt b/data/GrabberConfig/mobilmania.cz.txt new file mode 100644 index 00000000..f2c571c9 --- /dev/null +++ b/data/GrabberConfig/mobilmania.cz.txt @@ -0,0 +1,7 @@ +body: //*[@class='ar-annotation'] | //div[contains(@class, 'ar-content')] +strip_id_or_class: ar-link-to-another +strip_id_or_class: ar-tags +next_page_link: //a[@data-tracker='Navigace,NextChapter'] + +test_url: http://www.mobilmania.cz/clanky/10-tipu-na-nejlepsi-tablet-pod-stromecek-vanoce-2015/sc-3-a-1332803/default.aspx +test_url: http://www.mobilmania.cz/clanky/tyden-mobilne-311-kyklop-od-asusu-a-fenomen-5g/sc-3-a-1332804/default.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/modernghana.com.txt b/data/GrabberConfig/modernghana.com.txt new file mode 100644 index 00000000..306ef8d9 --- /dev/null +++ b/data/GrabberConfig/modernghana.com.txt @@ -0,0 +1,8 @@ +title: //meta[@property="og:title"]/@content +author: //meta[@name="author"]/@content +date: //span[@class='date1'] +body: //div[@id='newsimage'] | //div[@id='bodytext'] +tidy: no +prune: no + +test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html
\ No newline at end of file diff --git a/data/GrabberConfig/momentumsaga.com.txt b/data/GrabberConfig/momentumsaga.com.txt new file mode 100644 index 00000000..83c29884 --- /dev/null +++ b/data/GrabberConfig/momentumsaga.com.txt @@ -0,0 +1,5 @@ +title: //div[contains(concat(' ',normalize-space(@class),' '),' post-header ')]//h1 +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-entry ')] +date: //span[contains(concat(' ',normalize-space(@class),' '),' date ')] + +test_url: http://www.momentumsaga.com/2015/03/meus-dez-livros-preferidos-de-ficcao-cientifica.html diff --git a/data/GrabberConfig/monde-diplomatique.fr.txt b/data/GrabberConfig/monde-diplomatique.fr.txt new file mode 100644 index 00000000..f3469f9d --- /dev/null +++ b/data/GrabberConfig/monde-diplomatique.fr.txt @@ -0,0 +1,19 @@ +title://h1[@class="h1"] +body: //div[@class="contenu-principal"]/div[@class="texte"] + +requires_login: yes + +login_uri: https://lecteurs.mondediplo.net?page=connexion_sso +login_username_field: email +login_password_field: mot_de_passe + +login_extra_fields: page=connexion +login_extra_fields: formulaire_action=identification_sso +login_extra_fields: formulaire_action_args=@=xpath("//form//input[@name='formulaire_action_args']", request_html(config.getLoginUri())) +login_extra_fields: retour=http://www.monde-diplomatique.fr/ +login_extra_fields: site_distant=http://www.monde-diplomatique.fr/ +login_extra_fields: valider=valider + +not_logged_in_xpath: //div[@id="paywall"] + +test_url: https://blog.mondediplo.net/2017-01-13-Les-vrais-responsables-des-fausses-nouvelles diff --git a/data/GrabberConfig/money.cnn.com.txt b/data/GrabberConfig/money.cnn.com.txt new file mode 100644 index 00000000..d5e03d20 --- /dev/null +++ b/data/GrabberConfig/money.cnn.com.txt @@ -0,0 +1,24 @@ +title: //meta[@property="og:title"]/@content +title: //h1[@class='storyheadline'] +author: //meta[@name="AUTHOR"]/@content +date: //span[@class='cnnDateStamp'] +date: 
//meta[@name="DATE"]/@content +body: //div[@id='storytext' or @class='storytext'] + +strip_id_or_class: ie_column +strip_id_or_class: sharewidgets +strip_image_src: bug.gif + +strip: //div[@class="hed_side"] +strip: //span[@class="byline"] +strip: //a[@class="soc-twtname"] +strip: //span[@class="cnnDateStamp"] +strip: //div[@class="storytimestamp"] +strip: //div[@class="cnnCol_side"] + +prune: no +tidy: no + +test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 +test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm +test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm
\ No newline at end of file diff --git a/data/GrabberConfig/monkeyzen.com.txt b/data/GrabberConfig/monkeyzen.com.txt new file mode 100644 index 00000000..f779c38e --- /dev/null +++ b/data/GrabberConfig/monkeyzen.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://monkeyzen.com/2011/09/siluetas-de-clasicos-a-modo-de-vinilos
\ No newline at end of file diff --git a/data/GrabberConfig/montelimar-news.fr.txt b/data/GrabberConfig/montelimar-news.fr.txt new file mode 100644 index 00000000..87a196c9 --- /dev/null +++ b/data/GrabberConfig/montelimar-news.fr.txt @@ -0,0 +1,12 @@ + +title: //a[contains(concat(' ',normalize-space(@class),' '),' titre_news ')] +body: //div[@id='titre'] + +find_string: lang="en-US" +replace_string: lang="fr-FR" + +strip_id_or_class: titre_news +strip_id_or_class: group1 + +test_url: http://www.montelimar-news.fr/article/parfum-de-jazz-en-drome-provencale/1/9654.html#955c5 + diff --git a/data/GrabberConfig/moo.nac.uci.edu.txt b/data/GrabberConfig/moo.nac.uci.edu.txt new file mode 100644 index 00000000..24c949e9 --- /dev/null +++ b/data/GrabberConfig/moo.nac.uci.edu.txt @@ -0,0 +1,9 @@ +title: //div[@id='header']//h1[1] + +body: //div[@id='content'] + +strip_id_or_class: toc + +prune: no + +test_url: http://moo.nac.uci.edu/~hjm/HOWTO_move_data.html diff --git a/data/GrabberConfig/moonsault.de.txt b/data/GrabberConfig/moonsault.de.txt new file mode 100644 index 00000000..55026eeb --- /dev/null +++ b/data/GrabberConfig/moonsault.de.txt @@ -0,0 +1,13 @@ +strip_image_src: menu +strip_image_src: templates +strip: //div/a +strip: //div/b +strip: //div/strong +strip: //td[@width='30%'] +strip: //br[1] +strip: //br[2] +strip: //br[3] +strip: //br[4] +strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home'] +strip_id_or_class: cse-branding-right +test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous
\ No newline at end of file diff --git a/data/GrabberConfig/mothering.com.txt b/data/GrabberConfig/mothering.com.txt new file mode 100644 index 00000000..a34adff7 --- /dev/null +++ b/data/GrabberConfig/mothering.com.txt @@ -0,0 +1,7 @@ +title: //h2[contains(@class,'post_headline')] +body: //div[@class='entry'] +convert_double_br_tags: yes +strip_image_src: _selected.gif +strip_id_or_class: addthis_ +strip: //a[contains(@href,'feedburner.com')] +test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down
\ No newline at end of file diff --git a/data/GrabberConfig/motherjones.com.txt b/data/GrabberConfig/motherjones.com.txt new file mode 100644 index 00000000..851feb7e --- /dev/null +++ b/data/GrabberConfig/motherjones.com.txt @@ -0,0 +1,15 @@ +title: //h1 +body: //div[@id = 'content-area'] +next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')] +tidy: no +author: //p[contains(@class, 'byline')]/a + +strip_id_or_class: node-header +strip_id_or_class: hdr-tools +strip_id_or_class: node-body-break +strip_id_or_class: pullquote +strip_id_or_class: node-pager +strip_id_or_class: author-bio +strip_id_or_class: node-footer + +test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor
\ No newline at end of file diff --git a/data/GrabberConfig/moto-net.com.txt b/data/GrabberConfig/moto-net.com.txt new file mode 100644 index 00000000..8c337a8a --- /dev/null +++ b/data/GrabberConfig/moto-net.com.txt @@ -0,0 +1,11 @@ + +# body composed of: +# - "accroche" : the introduction text +# - "presentation-class-wrapper"/h2 : the subtitle of each page for multipage articles +# - "field-name-body" : the actual article content +body: //div[contains(concat(' ',normalize-space(@class),' '),' accroche ')] | //div[contains(concat(' ',normalize-space(@class),' '),' presentation-class-wrapper ')]/h2 | //div[contains(concat(' ',normalize-space(@class),' '),' field-name-body ')] + +# the next page is the link contained after the "current" "book_navigation_item" +next_page_link: //li[contains(concat(' ',normalize-space(@class),' '),' book_navigation_item ') and contains(concat(' ',normalize-space(@class),' '),' current ')]/following-sibling::li[1]/a/@href + +test_url: http://www.moto-net.com/article/comparatif-superbike-2018-aprilia-rsv4-rf-vs-bmw-s1000rr-vs-ducati-panigale-v4-s-1-comparo-sbk-2018-page-1-les-nouvelles-du-vieux-continent.html diff --git a/data/GrabberConfig/motorfull.com.txt b/data/GrabberConfig/motorfull.com.txt new file mode 100644 index 00000000..c6bec7e9 --- /dev/null +++ b/data/GrabberConfig/motorfull.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://motorfull.com/2011/09/aparca-valeo-park4u-remote
\ No newline at end of file diff --git a/data/GrabberConfig/motorsport-magazin.com.txt b/data/GrabberConfig/motorsport-magazin.com.txt new file mode 100644 index 00000000..fed79224 --- /dev/null +++ b/data/GrabberConfig/motorsport-magazin.com.txt @@ -0,0 +1,11 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.motorsport-magazin.com%2Fdtm%2Fnews-246125-dtm-2018-rahmenprogramm-klassik-tourenwagen%2F + +body: //article +strip: //ins +strip: //aside +strip: //footer +strip_id_or_class: twitter-tweet + +test_url: https://www.motorsport-magazin.com/dtm/news-246125-dtm-2018-rahmenprogramm-klassik-tourenwagen/ diff --git a/data/GrabberConfig/movie.douban.com.txt b/data/GrabberConfig/movie.douban.com.txt new file mode 100644 index 00000000..eae211ed --- /dev/null +++ b/data/GrabberConfig/movie.douban.com.txt @@ -0,0 +1,12 @@ +# This filter is tested on: +# http://movie.douban.com/review/1062013/ + +title: //span[contains(@property, 'v:summary')] +author: //span[contains(@property, 'v:reviewer')] +date://span[contains(@property, 'v:dtreviewed')] +body://div[contains(@class, 'main-bd')] + +strip://img[contains(@class,'rating')]|//img[contains(@class,'review-stat')] +convert_double_br_tags: yes +test_url: http://movie.douban.com/review/1062013/ +test_url: http://movie.douban.com/review/1021870/
\ No newline at end of file diff --git a/data/GrabberConfig/msdn.microsoft.com.txt b/data/GrabberConfig/msdn.microsoft.com.txt new file mode 100644 index 00000000..e28840ff --- /dev/null +++ b/data/GrabberConfig/msdn.microsoft.com.txt @@ -0,0 +1,5 @@ +body: //div[@id="mainBody"] +prune: no +test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx +test_url: https://msdn.microsoft.com/library/hh191443(vs.110).aspx +test_contains: An async method typically contains diff --git a/data/GrabberConfig/msnbc.msn.com.txt b/data/GrabberConfig/msnbc.msn.com.txt new file mode 100644 index 00000000..f008d2d1 --- /dev/null +++ b/data/GrabberConfig/msnbc.msn.com.txt @@ -0,0 +1,21 @@ +title: //title +author: //div[@id='byline'] + +date: //div[contains(@class,'timestamp')]/abbr/text() + +body: //div[@id='intellitTXT'] + +strip: //div[@id='byline'] +strip: //div[contains(@class,'timestamp')] +strip: //div[contains(@class, 'ad-label')] +strip: //div[contains(@class, 'ad-break')] +strip: //span[contains(@class, 'x-video')] +strip: //span[contains(@class, 'inline')] +strip: //div[contains(@class, 'video')] +strip: //div[contains(@class, 'discuss')] +strip: //div[@id='most-popular'] +strip: //div[contains(@class,'drawer')] +strip: //*[contains(@class, 'hide')] + +footnotes: no +test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE
\ No newline at end of file diff --git a/data/GrabberConfig/mtlblog.com.txt b/data/GrabberConfig/mtlblog.com.txt new file mode 100644 index 00000000..ad4113b2 --- /dev/null +++ b/data/GrabberConfig/mtlblog.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.mtlblog.com%2Flifestyle%2F11-super-fun-hidden-swimming-holes-close-to-montreal-you-have-to-road-trip-to + +body: //div[@id='article-text'] +test_url: https://www.mtlblog.com/lifestyle/11-super-fun-hidden-swimming-holes-close-to-montreal-you-have-to-road-trip-to
\ No newline at end of file diff --git a/data/GrabberConfig/multinationales.org.txt b/data/GrabberConfig/multinationales.org.txt new file mode 100644 index 00000000..c1f6e70b --- /dev/null +++ b/data/GrabberConfig/multinationales.org.txt @@ -0,0 +1,8 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' main ')] | //div[contains(concat(' ',normalize-space(@class),' '),' notes ')] + +date: //time[@pubdate='pubdate']/@datetime + +author: //span[contains(concat(' ',normalize-space(@class),' '),' author ')]//a + +test_url: http://multinationales.org/After-their-attacks-on-climate-science-industrial-lobbyists-target-the diff --git a/data/GrabberConfig/muycomputerpro.com.txt b/data/GrabberConfig/muycomputerpro.com.txt new file mode 100644 index 00000000..19845ded --- /dev/null +++ b/data/GrabberConfig/muycomputerpro.com.txt @@ -0,0 +1,4 @@ +title: //h1[@class='entry-title'] +body: //div[@id='mvp-content-main'] + +test_url: https://www.muycomputerpro.com/2017/09/20/fundacion-universidad-empresa-hyperloop-one diff --git a/data/GrabberConfig/muyinteresante.es.txt b/data/GrabberConfig/muyinteresante.es.txt new file mode 100644 index 00000000..0e2cb594 --- /dev/null +++ b/data/GrabberConfig/muyinteresante.es.txt @@ -0,0 +1,9 @@ +title: //h1[@class='article--title'] +body: //div[@id='paragraphs'] + +strip: //figure +strip: //div[contains(concat(' ',normalize-space(@class),' '),' article--info ')] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' distinguished--content ')] +strip: //div[@class='distinguished'] + +test_url: https://www.muyinteresante.es/naturaleza/articulo/los-5-fenomenos-meteorologicos-mas-mortales-521495629629 diff --git a/data/GrabberConfig/muylinux.com.txt b/data/GrabberConfig/muylinux.com.txt new file mode 100644 index 00000000..7e648720 --- /dev/null +++ b/data/GrabberConfig/muylinux.com.txt @@ -0,0 +1,9 @@ +title: //h1[@class='entry-title'] +body: //div[@class='text'] +author: //li[@class='entry-author']//a +date: 
//li[@class='entry-date'] + +# first image inside the article +strip: //div[@class='text']//img[1] + +test_url: http://www.muylinux.com/2016/03/22/kde-plasma-5-6 diff --git a/data/GrabberConfig/myfoxatlanta.com.txt b/data/GrabberConfig/myfoxatlanta.com.txt new file mode 100644 index 00000000..8a7590ab --- /dev/null +++ b/data/GrabberConfig/myfoxatlanta.com.txt @@ -0,0 +1,5 @@ +body: //div[@id='WNStoryBody'] +author: //div[@id='WNStoryByline'] +prune: no + +test_url: http://www.myfoxatlanta.com/category/233685/local-news?clienttype=rss
\ No newline at end of file diff --git a/data/GrabberConfig/myrecipes.com.txt b/data/GrabberConfig/myrecipes.com.txt new file mode 100644 index 00000000..956be1e6 --- /dev/null +++ b/data/GrabberConfig/myrecipes.com.txt @@ -0,0 +1,12 @@ +title: //h2[contains(@class, 'name')] +body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')] + +strip_id_or_class: photoBy +strip_id_or_class: link + +single_page_link: //li[@class='print']/a[contains(@href, '/print/')] + +prune: no +tidy: no + +test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/
\ No newline at end of file diff --git a/data/GrabberConfig/n.survol.fr.txt b/data/GrabberConfig/n.survol.fr.txt new file mode 100644 index 00000000..fb81ed70 --- /dev/null +++ b/data/GrabberConfig/n.survol.fr.txt @@ -0,0 +1,4 @@ +title: //h1 +date: //header//time/@datetime + +test_url: https://n.survol.fr/n/gerer-son-potager diff --git a/data/GrabberConfig/n0where.net.txt b/data/GrabberConfig/n0where.net.txt new file mode 100644 index 00000000..1fec44b0 --- /dev/null +++ b/data/GrabberConfig/n0where.net.txt @@ -0,0 +1,7 @@ +title://div[@class='main-title single-title entry-title'] +body://div[@id="content-anchor-inner"] +date://meta[@content] + +strip: //noscript + +test_url: https://n0where.net/dump-and-analyze-net-applications-memory-memoscope-net/ diff --git a/data/GrabberConfig/nachdenkseiten.de.txt b/data/GrabberConfig/nachdenkseiten.de.txt new file mode 100644 index 00000000..c1bc0a12 --- /dev/null +++ b/data/GrabberConfig/nachdenkseiten.de.txt @@ -0,0 +1,12 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.nachdenkseiten.de%2F%3Fp%3D44450 + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')] +title: //h3 +strip_id_or_class: spendenfoerdern +strip_id_or_class: thetagsov +strip_id_or_class: powerpress_player +strip_id_or_class: upTop + +test_url: https://www.nachdenkseiten.de/?p=44450 diff --git a/data/GrabberConfig/nachrichten.at.txt b/data/GrabberConfig/nachrichten.at.txt new file mode 100644 index 00000000..64015ca9 --- /dev/null +++ b/data/GrabberConfig/nachrichten.at.txt @@ -0,0 +1,8 @@ +single_page_link: //a[@class="drucken"] + +body: //div[@class="artikeldruck"] + +strip: //div[@class="druckheadline"] +strip: //div[@class="druckfuss"] + +test_url: http://www.nachrichten.at/nachrichten/chronik/Silvester-in-der-Bundeshauptstadt-Anti-Terror-Einheiten-ruesten-sich;art58,2069166 diff --git 
a/data/GrabberConfig/naiz.eus.txt b/data/GrabberConfig/naiz.eus.txt new file mode 100644 index 00000000..9e48333a --- /dev/null +++ b/data/GrabberConfig/naiz.eus.txt @@ -0,0 +1,4 @@ +body: //div[contains(@class,'widget full_article')] + +test_url: http://www.naiz.eus/eu/actualidad/noticia/20151002/adegi-afirma-que-los-jovenes-viven-una-vida-muy-comoda-y-no-tienen-hambre-para-emprender# +test_url: http://www.naiz.eus/eu/actualidad/noticia/20151012/podemos-euskadi-critica-que-otegi-y-sus-companeros-quieran-pasar-por-grandes-pacifistas diff --git a/data/GrabberConfig/nakedsecurity.sophos.com.txt b/data/GrabberConfig/nakedsecurity.sophos.com.txt new file mode 100644 index 00000000..57bb2b48 --- /dev/null +++ b/data/GrabberConfig/nakedsecurity.sophos.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='entry-content'] +title: //div[@class='entry-title'] +strip: //div[@class='entry-sharing'] + +test_url: https://nakedsecurity.sophos.com/2016/04/04/new-ransomware-with-an-old-trick-petya-parties-like-its-1989/ diff --git a/data/GrabberConfig/nasa.gov.txt b/data/GrabberConfig/nasa.gov.txt new file mode 100644 index 00000000..7df1112b --- /dev/null +++ b/data/GrabberConfig/nasa.gov.txt @@ -0,0 +1,8 @@ +title: //div[@class='address']/span +author: substring-before(//span[@class='credits'],',') +date: //div[@class='promodatepress']/span +body: //div[@class='default_style_wrap'] +strip: //div[@class='text_adjust'] +strip: //div[@class='skiplink'] +strip: //h2 +test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html
\ No newline at end of file diff --git a/data/GrabberConfig/natura-sciences.com.txt b/data/GrabberConfig/natura-sciences.com.txt new file mode 100644 index 00000000..5f816f7e --- /dev/null +++ b/data/GrabberConfig/natura-sciences.com.txt @@ -0,0 +1,13 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +author: substring-before(//div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]//strong[text()='Auteur :']/following-sibling::em, ',') + +strip_id_or_class: sharify-container + +# strip the "read also" paragraph about related article: +strip: //strong[text()='Lire aussi']/following-sibling::a[contains(@href, '//www.natura-sciences.com/')]/parent::p + +test_url: http://www.natura-sciences.com/agriculture/miscanthus-huile-de-ricin-biosource.html +test_url: http://www.natura-sciences.com/environnement/montagne-dor-guyane-projet-minier.html + diff --git a/data/GrabberConfig/nature.com.txt b/data/GrabberConfig/nature.com.txt new file mode 100644 index 00000000..5a91504c --- /dev/null +++ b/data/GrabberConfig/nature.com.txt @@ -0,0 +1,2 @@ +body: //div[@id='article'] +test_url: https://www.nature.com/npp/journal/v42/n11/full/npp201786a.html
\ No newline at end of file diff --git a/data/GrabberConfig/nbnnews.com.au.txt b/data/GrabberConfig/nbnnews.com.au.txt new file mode 100644 index 00000000..a2409878 --- /dev/null +++ b/data/GrabberConfig/nbnnews.com.au.txt @@ -0,0 +1,3 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ') or contains(@class, 'single-post-thumb')] +test_url: http://www.nbnnews.com.au/2015/03/24/lismore-man-will-attempt-to-run-around-australia/ +test_url: http://www.nbnnews.com.au/category/nthn-rivers-sport/feed/ diff --git a/data/GrabberConfig/ncbi.nlm.nih.gov.txt b/data/GrabberConfig/ncbi.nlm.nih.gov.txt new file mode 100644 index 00000000..2b8744c1 --- /dev/null +++ b/data/GrabberConfig/ncbi.nlm.nih.gov.txt @@ -0,0 +1,10 @@ +title: //div[contains(concat(' ',normalize-space(@class),' '),' rprt ')]//h1 +author: //div[contains(concat(' ',normalize-space(@class),' '),' auths ')] +date: //div[contains(concat(' ',normalize-space(@class),' '),' cit ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' abstr ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' icons ')] +strip: //div[@id='footer'] +strip: //div[contains(concat(' ',normalize-space(@class),' '),' header ')] +prune: yes +test_url: https://www.ncbi.nlm.nih.gov/pubmed/10356353?access_num=10356353&link_type=MED&dopt=Abstract + diff --git a/data/GrabberConfig/neh.gov.txt b/data/GrabberConfig/neh.gov.txt new file mode 100644 index 00000000..e7cc4313 --- /dev/null +++ b/data/GrabberConfig/neh.gov.txt @@ -0,0 +1,17 @@ +#host configuration should be http://www.neh.gov/news/humanities/ + + +#meta data +title:substring-after(substring-after(//title,':'),':') +author:substring-after(//h2[@class = 'subHead'],'By') +date:substring-before(substring-after(//title,':'),':') + +#img and caption handling +wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text() +wrap_in(fieldset)://div[@id = 'mainContent']/table + +# clean up +strip: //table[@class = 
'marginpaddingTop'] +strip: //h2[@class = 'subHead'] + +test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html
\ No newline at end of file diff --git a/data/GrabberConfig/net-security.org.txt b/data/GrabberConfig/net-security.org.txt new file mode 100644 index 00000000..b7fedbf3 --- /dev/null +++ b/data/GrabberConfig/net-security.org.txt @@ -0,0 +1,7 @@ +title: //div[@class='content-title'] +#date: substring-after(//div[@class='dernek-text-under'],'Posted on') +body: //div[@class='content-item'] +next_page_link: //li[@class='next']/a +convert_double_br_tags: yes + +test_url: http://www.net-security.org/article.php?id=1732
\ No newline at end of file diff --git a/data/GrabberConfig/netmagazine.com.txt b/data/GrabberConfig/netmagazine.com.txt new file mode 100644 index 00000000..dcea047c --- /dev/null +++ b/data/GrabberConfig/netmagazine.com.txt @@ -0,0 +1,16 @@ +title: //h1 +author: //div[@class="submitted"]/span + +# seems like this should work, but nothing is returned. Issue with xpath parser? +date: //div[@class="submitted"]/time + +body: //div[@id="main-content"] + +strip_comments: no + +strip: //h1 +strip: //div[@class="submitted"] +strip: //dd[@class="profile-avatar"] +strip: //div[@class="author-profile"]/dl/dt[1] +strip: //div[@id="right-col"] +test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile
\ No newline at end of file diff --git a/data/GrabberConfig/networkworld.com.txt b/data/GrabberConfig/networkworld.com.txt new file mode 100644 index 00000000..f097298b --- /dev/null +++ b/data/GrabberConfig/networkworld.com.txt @@ -0,0 +1,26 @@ +# All sites of the IDG network can be extracted using the same rules, +# make sure to update all of them + +author: //meta[@name="author"]/@content +date: //meta[@name="DC.date.issued"]/@content + +body: //div[@itemprop="articleBody"] +body: //div[@itemprop="reviewBody"] +body: //figcaption|//div[@class="img-wrapper"]/noscript/img + +next_page_link: //a[@rel="next"] + +strip: //aside +strip: //h3[contains(., "See also:")] +strip: //div[@id="article-top-page-number"] +strip: //p[starts-with(normalize-space(.), '[')] +strip: //p[starts-with(normalize-space(.), '+')] + +test_url: http://www.networkworld.com/index.rss +test_url: http://www.networkworld.com/article/2952852/email-services/4-great-gmail-tips-you-might-not-know-about.html#tk.rss_all +test_url: http://www.networkworld.com/article/2951819/infrastructure-management/cios-say-applecare-for-enterprise-is-lacking.html +test_url: http://www.networkworld.com/article/2951823/cisco-subnet/early-access-qa-new-cisco-ceo-chuck-robbins-heads-into-hyper-connected-mode.html +test_url: http://www.networkworld.com/article/2947478/mobile/what-if-the-apple-watch-really-is-a-flop.html +test_url: http://www.networkworld.com/article/2953522/cloud-computing/nfv-a-hit-for-opendaylight.html +test_url: http://www.networkworld.com/article/2960522/wi-fi/best-wi-fi-stumblers-for-the-mac.html +test_url: http://www.networkworld.com/article/2969669/email-services/outlook-2016-review-a-new-coat-of-paint-on-the-same-reliable-personal-information-manager.html diff --git a/data/GrabberConfig/netzpolitik.org.txt b/data/GrabberConfig/netzpolitik.org.txt new file mode 100644 index 00000000..a4ab51cd --- /dev/null +++ b/data/GrabberConfig/netzpolitik.org.txt @@ -0,0 +1,7 @@ +title: 
//h1[@class='entry-title'] +author: //a[@ref='author'] +date: //span[@class='entry-date'] +body: //div[@class='entry-content'] +strip_id_or_class: netzpolitik-cta + +test_url: http://netzpolitik.org/2011/buch-generation-facebook/ diff --git a/data/GrabberConfig/neues-deutschland.de.txt b/data/GrabberConfig/neues-deutschland.de.txt new file mode 100644 index 00000000..d128638b --- /dev/null +++ b/data/GrabberConfig/neues-deutschland.de.txt @@ -0,0 +1,9 @@ +title: //h1//span +date: //div[@class='Date'] + +body: //article//div[contains(concat(' ',normalize-space(@class),' '),' Content ')] + +strip_id_or_class: 'Content-Ad' +strip_id_or_class: 'ArticleShopAd' + +test_url: http://www.neues-deutschland.de/artikel/1009093.mutmassliche-rechtsterroristen-in-freital-festgenommen.html
\ No newline at end of file diff --git a/data/GrabberConfig/newleftproject.org.txt b/data/GrabberConfig/newleftproject.org.txt new file mode 100644 index 00000000..d9af99d8 --- /dev/null +++ b/data/GrabberConfig/newleftproject.org.txt @@ -0,0 +1,3 @@ +title: //div[contains(@class, 'article_header')]//h3 + +test_url: http://www.newleftproject.org/index.php/site/article_comments/do_we_need_a_facebook_of_the_left
\ No newline at end of file diff --git a/data/GrabberConfig/newrepublic.com.txt b/data/GrabberConfig/newrepublic.com.txt new file mode 100644 index 00000000..039f0385 --- /dev/null +++ b/data/GrabberConfig/newrepublic.com.txt @@ -0,0 +1,8 @@ +author: //span[@class="authors"] +date: //span[@class="date"] +body: //div[@class="primary"] + +strip: //div[@id="controls"] +strip: //div[@id="read-next"] + +test_url: http://www.newrepublic.com/article/112731/moocs-will-online-education-ruin-university-experience
\ No newline at end of file diff --git a/data/GrabberConfig/news-gazette.com.txt b/data/GrabberConfig/news-gazette.com.txt new file mode 100644 index 00000000..2b352707 --- /dev/null +++ b/data/GrabberConfig/news-gazette.com.txt @@ -0,0 +1,8 @@ +title: //div[@id="main-content"]//h2 + +author: //div[@id="main-content"]//span[@class="authors"] + +date: //div[@id="main-content"]//span[@class="timestamp"] + +body: //div[@id="main-content"]//div[@class="content"] +test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html
\ No newline at end of file diff --git a/data/GrabberConfig/news.cnet.com.txt b/data/GrabberConfig/news.cnet.com.txt new file mode 100644 index 00000000..78af70f4 --- /dev/null +++ b/data/GrabberConfig/news.cnet.com.txt @@ -0,0 +1,12 @@ +#This should apply to *.cnet.com. Not just news.cnet.com. +title: //h1 +author: //img[@class="mugshot"]/@alt +strip: //h1 +strip_id_or_class: breadcrumb +strip: //p[@id="introP"] +strip: //div[@class="postByline"] +strip: //div[@class="editorBio"] +strip: //div[@class="inline-slideshow"] +strip: //div[@class="related"] +body: //div[@class="postBody txtWrap"] +test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts
\ No newline at end of file diff --git a/data/GrabberConfig/news.com.au.txt b/data/GrabberConfig/news.com.au.txt new file mode 100644 index 00000000..7b35fab9 --- /dev/null +++ b/data/GrabberConfig/news.com.au.txt @@ -0,0 +1,6 @@ +body: //div[@class='story-body'] +prune: no +tidy: no + +test_url: http://www.news.com.au/lifestyle/parenting/babies/dad-shamed-for-taking-baby-into-mackay-shopping-centre-parents-room/news-story/02e22f3df7be275f74825feb63302cc6 +test_contains: Members of the public have come out diff --git a/data/GrabberConfig/news.detik.com.txt b/data/GrabberConfig/news.detik.com.txt new file mode 100644 index 00000000..629bc917 --- /dev/null +++ b/data/GrabberConfig/news.detik.com.txt @@ -0,0 +1,8 @@ +title://div[@class="content_detail"]/h1 + +author://div[@class="author"]/strong + +date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB') + +body://div[@class="text_detail"] +test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai
\ No newline at end of file diff --git a/data/GrabberConfig/news.mynavi.jp.txt b/data/GrabberConfig/news.mynavi.jp.txt new file mode 100644 index 00000000..1df47314 --- /dev/null +++ b/data/GrabberConfig/news.mynavi.jp.txt @@ -0,0 +1,11 @@ +title: //h2[@class="lyt-hdg-02-04"] + +author: //div[@class="lyt-namearea"]/a + +date: //div[@class="lyt-namearea"]/text() + +body: //div[@class="articleContent"] + +strip: //div[@id="tab-aside"] + +test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html
\ No newline at end of file diff --git a/data/GrabberConfig/news.pixelistes.com.txt b/data/GrabberConfig/news.pixelistes.com.txt new file mode 100644 index 00000000..b86b866b --- /dev/null +++ b/data/GrabberConfig/news.pixelistes.com.txt @@ -0,0 +1,12 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://www.journaldugeek.com/2015/09/09/apple-ipad-pro/ + +date: //meta[@property="og:updated_time"]/@content +next_page_link: //div[@class="post-content"]/div[@class='row pagination']/a[contains(concat(' ',normalize-space(@class),' '),' next ')] + +strip_id_or_class: jdg-recommend +strip_id_or_class: proofreader-bloc + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] +test_url: http://news.pixelistes.com/pixelistes-partenaire-du-salon-de-la-photo-de-paris/ diff --git a/data/GrabberConfig/news.rambler.ru.txt b/data/GrabberConfig/news.rambler.ru.txt new file mode 100644 index 00000000..1d547334 --- /dev/null +++ b/data/GrabberConfig/news.rambler.ru.txt @@ -0,0 +1,9 @@ +body: //article +title: //h1 +author: //span[@class='b-article-source-dropdown'] +strip: //span[@class='b-article-photo-incut__source'] +strip: //a[@class='b-read-more b-read-more_bottom'] + + +tidy:no +test_url: http://news.rambler.ru/12972208/
\ No newline at end of file diff --git a/data/GrabberConfig/news.techmeme.com.txt b/data/GrabberConfig/news.techmeme.com.txt new file mode 100644 index 00000000..ba4db828 --- /dev/null +++ b/data/GrabberConfig/news.techmeme.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='main']/div[@class='item'] +strip: //div[@class='right'] + +test_url: http://news.techmeme.com/110516/fh-rip
\ No newline at end of file diff --git a/data/GrabberConfig/news.yahoo.com.txt b/data/GrabberConfig/news.yahoo.com.txt new file mode 100644 index 00000000..fc1739c8 --- /dev/null +++ b/data/GrabberConfig/news.yahoo.com.txt @@ -0,0 +1,12 @@ +title: //meta[@property='og:title']/@content +title: //h1[@class='headline'] +author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn'] +date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title +body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')] +#strip: //cite/abbr +strip_id_or_class: action +strip_id_or_class: prefetch +tidy: no +prune: no + +test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html
\ No newline at end of file diff --git a/data/GrabberConfig/news.ycombinator.com.txt b/data/GrabberConfig/news.ycombinator.com.txt new file mode 100644 index 00000000..f7441d17 --- /dev/null +++ b/data/GrabberConfig/news.ycombinator.com.txt @@ -0,0 +1,3 @@ +strip_comments: no +strip: //a[. = 'reply'] +test_url: http://news.ycombinator.com/item?id=1516461
\ No newline at end of file diff --git a/data/GrabberConfig/news247.gr.txt b/data/GrabberConfig/news247.gr.txt new file mode 100644 index 00000000..87637bed --- /dev/null +++ b/data/GrabberConfig/news247.gr.txt @@ -0,0 +1,6 @@ +title: //h1[@class='title'] + +body: //img[@id='relPicsMainPic'] | //div[contains(@class, 'storyContent')] + +test_url: http://news247.gr/eidiseis/katatheseis_fwtia_htan_apofasismenoi_akomh_kai_na_afairesoyn_zwes_an_thewrousan_oti_to_thuma_htan_antipalos_toys.2433351.html +test_url: http://news247.gr/?widget=rssfeed&view=feed&contentId=38291
\ No newline at end of file diff --git a/data/GrabberConfig/newsbomb.gr.txt b/data/GrabberConfig/newsbomb.gr.txt new file mode 100644 index 00000000..5eb0ea46 --- /dev/null +++ b/data/GrabberConfig/newsbomb.gr.txt @@ -0,0 +1,9 @@ +date: //meta[@name='og:article:published_time']/@value + +body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] + +strip_id_or_class: itemImageGallery + +prune: no + +test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex
\ No newline at end of file diff --git a/data/GrabberConfig/newstatesman.com.txt b/data/GrabberConfig/newstatesman.com.txt new file mode 100644 index 00000000..6c133aa6 --- /dev/null +++ b/data/GrabberConfig/newstatesman.com.txt @@ -0,0 +1,13 @@ +body: //div[(contains(@class,'content')) and (contains(@class,'node-blogs'))] +body: //div[contains(@class,'field-item')] + +date: //div[@class='article-date'] + +author: //div[@class='author-byline']/a + +strip: //ul[@class='flippy'] +strip: //div[contains(@class,'first-appeared-in')] + +test_url: https://www.newstatesman.com/world/2018/05/leader-how-italian-election-result-exposed-europe-s-fraying-union +test_url: https://www.newstatesman.com/culture/books/2018/05/two-faces-philip-roth +test_url: https://www.newstatesman.com/books/2011/02/montaigne-life-playing diff --git a/data/GrabberConfig/newsunspun.org.txt b/data/GrabberConfig/newsunspun.org.txt new file mode 100644 index 00000000..247bbebb --- /dev/null +++ b/data/GrabberConfig/newsunspun.org.txt @@ -0,0 +1,10 @@ +body: //div[@class='right']//div[@class='articles'] +author: //div[@id='artinfo']//a[contains(@href, '/author/')] +strip: //div[@id='artinfo'] +strip: //table[//a[contains(@href, 'twitter.com')]] +strip_id_or_class: twitter + +prune: no +tidy: no + +test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb
\ No newline at end of file diff --git a/data/GrabberConfig/newsweek.com.txt b/data/GrabberConfig/newsweek.com.txt new file mode 100644 index 00000000..565648ba --- /dev/null +++ b/data/GrabberConfig/newsweek.com.txt @@ -0,0 +1,6 @@ +body: //div[@class = 'article-body'] +title: //h1[@class = 'article-title'] +strip: //aside + +test_url: http://www.newsweek.com/day-steve-mcqueen-met-his-new-nazi-neighbor-keith-moon-229741 +test_url: http://www.newsweek.com/2014/06/13/how-greylock-partners-finds-next-facebook-253329.html diff --git a/data/GrabberConfig/newswise.com.txt b/data/GrabberConfig/newswise.com.txt new file mode 100644 index 00000000..10120ea1 --- /dev/null +++ b/data/GrabberConfig/newswise.com.txt @@ -0,0 +1,17 @@ +prune: no +tidy: no + +title: //h1/a[2] +body: //div[@id="main"] +author: //span[@id="articlesource"] +date: //span[contains(@class, 'releasedate')] + +strip: //div[@class="inst-logo"] +strip: //h1[1] + +strip_id_or_class: addthis +strip_id_or_class: released +strip_id_or_class: skiptranslate +strip_id_or_class: flash + +test_url: http://www.newswise.com/articles/first-heat-wave-of-season-puts-elderly-at-risk diff --git a/data/GrabberConfig/newyorker.com.txt b/data/GrabberConfig/newyorker.com.txt new file mode 100644 index 00000000..c550a4e6 --- /dev/null +++ b/data/GrabberConfig/newyorker.com.txt @@ -0,0 +1,10 @@ +title: //h1[@id='articlehed'] | //h2[@id="articleintro"] +body: //article//div[@id='articleBody'] + +author: //header//p[contains(@class, 'Byline')]//a + +date: //meta[@name='pubdate']/@value + +single_page_link: //div[@class='paginationViewSinglePage']/a +test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html +test_url: http://www.newyorker.com/reporting/2013/04/22/130422fa_fact_bilger?currentPage=all&mobify=0
\ No newline at end of file diff --git a/data/GrabberConfig/nextcloud.com.txt b/data/GrabberConfig/nextcloud.com.txt new file mode 100644 index 00000000..26898a65 --- /dev/null +++ b/data/GrabberConfig/nextcloud.com.txt @@ -0,0 +1,3 @@ +date: //time/@datetime + +test_url: https://nextcloud.com/blog/celebrating-2-years-nextcloud/ diff --git a/data/GrabberConfig/nfl.com.txt b/data/GrabberConfig/nfl.com.txt new file mode 100644 index 00000000..956b288f --- /dev/null +++ b/data/GrabberConfig/nfl.com.txt @@ -0,0 +1,11 @@ +# doesn't look like selecting an attribute value works? +# author: //meta[@id="authorName"]@value + +author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ") +date: //abbr[@id="article-time"] +title: //div[@id="article-hdr"]/h1 +body: //div[@class="articleText"] + +# strip miscellaneous teasers & etc +strip: //div[@class="removeformobile"] +test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream
\ No newline at end of file diff --git a/data/GrabberConfig/ngm.nationalgeographic.com.txt b/data/GrabberConfig/ngm.nationalgeographic.com.txt new file mode 100644 index 00000000..44a82a95 --- /dev/null +++ b/data/GrabberConfig/ngm.nationalgeographic.com.txt @@ -0,0 +1,7 @@ +next_page_link: //div[@class='nextpage_continue']/a +strip: //div[@class='nextpage_continue'] +strip_id_or_class: nextpage +title: //div[@class='article_title']//h1 +body: //div[@class='article_title']/.. +body: //div[@class='content'] +test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text
\ No newline at end of file diff --git a/data/GrabberConfig/nintendoworldreport.com.txt b/data/GrabberConfig/nintendoworldreport.com.txt new file mode 100644 index 00000000..f0e28afb --- /dev/null +++ b/data/GrabberConfig/nintendoworldreport.com.txt @@ -0,0 +1,13 @@ +body: //div[@id="main"] +title: //div[@id="main"]/h3 + +# Remove ‘Review’ and ‘Wii’. +strip: //div[@class="badge"] + +# Remove duplicate title and country flag. +strip: //h3 + +# Commented out below are attempts to extract the author and date, which did not work. +# author: //p[@class="extra "]/a +# date: //p[@class="extra "]/span[@class="when"] +test_url: http://www.nintendoworldreport.com/review/28400
\ No newline at end of file diff --git a/data/GrabberConfig/nj.com.txt b/data/GrabberConfig/nj.com.txt new file mode 100644 index 00000000..4d69c431 --- /dev/null +++ b/data/GrabberConfig/nj.com.txt @@ -0,0 +1,4 @@ +body: //div[@id='article_container'] +strip_id_or_class: social + +test_url: http://www.nj.com/eagles/index.ssf/2018/04/should_eagles_trade_brandon_graham_observations.html#incart_river_index diff --git a/data/GrabberConfig/nojesguiden.se.txt b/data/GrabberConfig/nojesguiden.se.txt new file mode 100644 index 00000000..b15f0612 --- /dev/null +++ b/data/GrabberConfig/nojesguiden.se.txt @@ -0,0 +1,5 @@ +author: //span[@class='meta']/span[@class='username'] +body: //div[@class='article-content'] + +strip_id_or_class: 'article-actions' +test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i
\ No newline at end of file diff --git a/data/GrabberConfig/northumberlandview.ca.txt b/data/GrabberConfig/northumberlandview.ca.txt new file mode 100644 index 00000000..f698d98e --- /dev/null +++ b/data/GrabberConfig/northumberlandview.ca.txt @@ -0,0 +1,11 @@ +title: //h1 +body: //div[@id='pn-maincontent'] +strip_id_or_class: z-menu +strip_id_or_class: news_category +strip_id_or_class: news_title +strip_id_or_class: news_modify +strip_id_or_class: news_morearticlesincat +strip_id_or_class: ezc_comments +strip_comments: yes + +test_url: http://www.northumberlandview.ca/index.php?module=news&type=user&func=display&sid=31127 diff --git a/data/GrabberConfig/nosalty.hu.txt b/data/GrabberConfig/nosalty.hu.txt new file mode 100644 index 00000000..7e20cadf --- /dev/null +++ b/data/GrabberConfig/nosalty.hu.txt @@ -0,0 +1,6 @@ +title: //div[@id='tab-recept']//h1 +body: //div[@id='tab-recept']//div[contains(@class, 'column-container')] +strip_id_or_class: ajanlo-box +prune: no + +test_url: http://www.nosalty.hu/recept/szupergyors-fank
\ No newline at end of file diff --git a/data/GrabberConfig/nota-bene.org.txt b/data/GrabberConfig/nota-bene.org.txt new file mode 100644 index 00000000..7651dcab --- /dev/null +++ b/data/GrabberConfig/nota-bene.org.txt @@ -0,0 +1,8 @@ +body://section[@class='descriptif'] | //section[contains(@class, 'texte')] | //section[@class='notes'] + +# Last edition date (if any) +date: //time[@itemprop='dateModified']/@datetime +# Publication date +date: //time[@itemprop='datePublished']/@datetime + +test_url: http://nota-bene.org/Une-situation-qui-AMPire diff --git a/data/GrabberConfig/notebookcheck.net.txt b/data/GrabberConfig/notebookcheck.net.txt new file mode 100644 index 00000000..1e9a4b13 --- /dev/null +++ b/data/GrabberConfig/notebookcheck.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.notebookcheck.net%2FLenovo-ThinkPad-X1-Carbon-Ultrabook-Review.138033.0.html + +body: //div[@id='nbc_maincontent'] +test_url: http://www.notebookcheck.net/Lenovo-ThinkPad-X1-Carbon-Ultrabook-Review.138033.0.html diff --git a/data/GrabberConfig/nothingbutthewax.com.txt b/data/GrabberConfig/nothingbutthewax.com.txt new file mode 100644 index 00000000..ae2a1649 --- /dev/null +++ b/data/GrabberConfig/nothingbutthewax.com.txt @@ -0,0 +1,16 @@ + +author: //div[contains(concat(' ',normalize-space(@class),' '),' titrearticle ')]/h4/a + +body: //div[contains(concat(' ',normalize-space(@class),' '),' module-video ')] | //div[contains(concat(' ',normalize-space(@class),' '),' module-text ')] + +strip_id_or_class: titrearticle +strip_id_or_class: nc_socialPanel + +# strip everything following the <h5>TOP</h5> located at the bottom of the article: +strip: //div[contains(concat(' ',normalize-space(@class),' '),' top ')]/h5[text()='TOP']/ancestor::*/following-sibling::* +strip: //div[contains(concat(' ',normalize-space(@class),' '),' 
top ')]/h5[text()='TOP']/parent::div + +# text article example: +test_url: http://nothingbutthewax.com/societe-2/assa-traore-une-antigone-contemporaine/7266/ +# video article example: +test_url: http://nothingbutthewax.com/video/inside-african-fashion/il-etait-une-fois-le-khanga/6901/ diff --git a/data/GrabberConfig/novastan.org.txt b/data/GrabberConfig/novastan.org.txt new file mode 100644 index 00000000..ee3f22ee --- /dev/null +++ b/data/GrabberConfig/novastan.org.txt @@ -0,0 +1,15 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' mkd-post-text ')] + +author: //h6[contains(concat(' ',normalize-space(@class),' '),' author ')]/a +author: //article[contains(concat(' ',normalize-space(@class),' '),' post ')]//a[@itemprop='author'] + +prune: no + +strip_id_or_class: entry-title +strip_id_or_class: simplefavorite-button +strip_id_or_class: mkd-blog-single-share +strip_id_or_class: mkd-single-tags-holder +strip: //p//a[@href='https://www.facebook.com/Novastan.org/']/ancestor::p[1] + +test_url: https://www.novastan.org/fr/kirghizstan/les-secrets-de-fabrication-du-feutre-kirghiz/ diff --git a/data/GrabberConfig/nplusonemag.com.txt b/data/GrabberConfig/nplusonemag.com.txt new file mode 100644 index 00000000..1b817c04 --- /dev/null +++ b/data/GrabberConfig/nplusonemag.com.txt @@ -0,0 +1,6 @@ +title: /html/body/div[3]/div/div/h1 + +body: //*[@id="article-body"] + + +test_url: http://nplusonemag.com/the-outskirts-of-progress
\ No newline at end of file diff --git a/data/GrabberConfig/npr.org.txt b/data/GrabberConfig/npr.org.txt new file mode 100644 index 00000000..8145eaf5 --- /dev/null +++ b/data/GrabberConfig/npr.org.txt @@ -0,0 +1,37 @@ +title: //div[contains(@class, 'storytitle')]//h1 +author: //p[@class="byline"]/span +body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')] +date: //meta[@name="date"]/@content + +strip_id_or_class: enlarge_measure +strip_id_or_class: enlarge_html +strip: //a[contains(@class, 'enlargeicon')] +strip: //div[contains(@class, 'bookedition')] +strip: //div[@class='textsize'] +strip: //ul[@class='genres'] +strip: //span[@class='bull'] +strip_id_or_class: secondary +strip_id_or_class: con1col +strip: //h3[@class='conheader'] + +#GDPR bypass +http_header(Cookie): trackingChoice=true; choiceVersion=1 + +replace_string(<a name="more"> </a>): <!-- no more --> +replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2> +replace_string(<div class="transcript storytext">): <div class="transcript storytext"><h2>Transcript</h2> + +prune: no +strip://div[@class="ecommercepop"] +strip://span[@class="bull"] +strip://span[@class="purchaseLink"] +strip://div[@class="enlarge_html"] +strip://div[@class="enlarge_measure"] +strip://div[@class="container con1col small"] +strip://a[contains(@class, "enlargebtn")] +strip://div[contains(@class, "bucketwrap internallink")] + +test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates +test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right +test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres +test_url: http://www.npr.org/templates/story/story.php?storyId=229103221 diff --git a/data/GrabberConfig/nrc.nl.txt 
b/data/GrabberConfig/nrc.nl.txt new file mode 100644 index 00000000..24f8a038 --- /dev/null +++ b/data/GrabberConfig/nrc.nl.txt @@ -0,0 +1,5 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' nmt-layout--sidebar-align-right ')] +strip_id_or_class: article__footer +strip_id_or_class: nmt-layout__sidebar + +test_url: http://www.nrc.nl/nieuws/2016/10/02/de-nederlandse-school-wanorde-onrust-en-lawaai-4566603-a1524441 diff --git a/data/GrabberConfig/ntoskrnl.org.txt b/data/GrabberConfig/ntoskrnl.org.txt new file mode 100644 index 00000000..f8daba08 --- /dev/null +++ b/data/GrabberConfig/ntoskrnl.org.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fntoskrnl.org%2F + +title: //h1 +body: //div[@id='content'] +test_url: http://ntoskrnl.org/ diff --git a/data/GrabberConfig/numerama.com.txt b/data/GrabberConfig/numerama.com.txt new file mode 100644 index 00000000..f10a698b --- /dev/null +++ b/data/GrabberConfig/numerama.com.txt @@ -0,0 +1,11 @@ +# Need html5lib or replace_string to handle correctly badly included in-content-ad inclusion script. html5lib is a lot slower. 
+# parser: html5lib +replace_string("</div>"): "</div>" +body: //article[@class='post-content'] +strip: //span[@class='summary-entry'] +strip: //footer + +test_url: http://www.numerama.com/sciences/243352-hubble-detecte-un-trou-noir-supermassif-propulse-hors-de-sa-galaxie.html +test_url: http://www.numerama.com/tech/242703-free-mobile-et-la-4g-en-illimite-ce-quil-faut-savoir.html +# Don't know why this one gets everything in bold: +test_url: http://www.numerama.com/business/243686-comme-convenu-quand-lenfer-dune-startup-se-transforme-en-succes-dauto-edition.html diff --git a/data/GrabberConfig/nybooks.com.txt b/data/GrabberConfig/nybooks.com.txt new file mode 100644 index 00000000..d95ec68e --- /dev/null +++ b/data/GrabberConfig/nybooks.com.txt @@ -0,0 +1,13 @@ +strip_id_or_class: sIFR-alternate +title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2 +single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))] + +body: //div[@id = 'article-body'] +strip_id_or_class:article-tools +strip_id_or_class:js_target +strip_id_or_class:marker +author://div[@id = 'page-title']/h3 +date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')] + + +test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/
\ No newline at end of file diff --git a/data/GrabberConfig/nymag.com.txt b/data/GrabberConfig/nymag.com.txt new file mode 100644 index 00000000..c2cb435c --- /dev/null +++ b/data/GrabberConfig/nymag.com.txt @@ -0,0 +1,19 @@ +title: //h2[contains(@class, 'primary')] +body: //*[@itemprop="articleBody"] +body: //div[@id='story'] +author: //*[@class='by']/a +date: substring-after(//*[@class='date'], 'Published') + +#Skip GDPR warning +http_header(Cookie): nymuc=11111111111 + +parser: html5php +tidy: no + +next_page_link: //div[@class='page-navigation']//li[@class='next']/a + +test_url: http://nymag.com/news/features/wall-street-2012-2/ +test_contains: bonus season is a sacred ritual + +test_url: http://nymag.com/daily/intelligencer/2016/04/america-tyranny-donald-trump.html +test_contains: This rainbow-flag polity diff --git a/data/GrabberConfig/nyteknik.se.txt b/data/GrabberConfig/nyteknik.se.txt new file mode 100644 index 00000000..f4bedb6a --- /dev/null +++ b/data/GrabberConfig/nyteknik.se.txt @@ -0,0 +1,8 @@ +title: //div[@class="article default-article"]/h1 +author: //p[@class="author"]/a[2] + +# Article introduction: +#move_into(//div[@class="article-bread"]): //p[@class="lead"] + +body: //div[@class="article-bread"] +test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece
\ No newline at end of file diff --git a/data/GrabberConfig/nytimes.com.txt b/data/GrabberConfig/nytimes.com.txt new file mode 100644 index 00000000..08d7e0bd --- /dev/null +++ b/data/GrabberConfig/nytimes.com.txt @@ -0,0 +1,71 @@ +# mobile.nytimes.com appears to be the same as www.nytimes.com now, +# so any changes here should probably also be made to mobile.nytimes.com.txt too + +title://h1[@class="articleHeadline"] +body: //div[contains(concat(' ',normalize-space(@class),' '),' story-body ')] +body://div[@id="article"] +body://*[@itemprop="articleBody"] +body: //div[contains(concat(' ',normalize-space(@class),' '),' g-body-article-container ')] +body: //article[@id='story'] +strip_id_or_class:articleTools +strip_id_or_class:readerscomment +#strip://div[contains(@class, "articleInline runaroundLeft")] +strip: //div[contains(@class, "doubleRule")] +# strip image credit - appears as a bold heading +strip: //div[contains(@class, "articleInline")]//h6 +strip_id_or_class:enlargeThis +strip_id_or_class:pageLinks +strip_id_or_class:memberTools +strip_id_or_class:articleExtras +strip_id_or_class:singleAd +strip_id_or_class:byline +strip_id_or_class:dateline +strip_id_or_class:articleheadline +strip_id_or_class:articleBottomExtra +strip_id_or_class:shareTools +strip_id_or_class:story-meta +strip_id_or_class:related-coverage +strip_id_or_class:ad-header +strip_id_or_class:bottom-ad +strip_id_or_class:advert_item +strip://a[contains(@href, 'nytimes.com/adx/')] +strip: //nyt_byline +strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')] +strip: //p[@class='caption']//a[contains(., 'More Photos')] +strip_id_or_class: ResponsiveAd + +strip_id_or_class: robots-nocontent +strip_id_or_class: hidden + +prune: no +tidy: no + +date: //meta[@property="article:published"]/@content +date: //meta[@itemprop="datePublished"]/@content + +find_string: src='https://static01.nyt.com/packages/flash/multimedia/ICONS/transparent.png +replace_string: 
ignore-src='https://static01.nyt.com/packages/flash/multimedia/ICONS/transparent.png +find_string: data-mediaviewer-src='https://static01.nyt.com +replace_string: src='https://static01.nyt.com + +single_page_link: //link[contains(@href, 'pagewanted=all')] +#mobile.nytimes.com looks same as regular www.nytimes.com now +#single_page_link: //link[@rel='alternate' and contains(@href, 'mobile.nytimes.com')]/@href +#single_page_link: concat(substring-before(//div[@id='pageLinks']//a[contains(@href, 'pagewanted=')]/@href, 'pagewanted='), 'pagewanted=all') + +strip://h6[@class = 'kicker'] + +test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html +test_contains: In this column I want to look at a not uncommon way of writing + +test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html +test_contains: IF you’ve seen enough of Aaron Sorkin’s theater + +test_url: https://www.nytimes.com/interactive/2016/books/review/best-books.html +test_contains: invention and speculation flow together + +test_url: http://www.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html +test_url: http://www.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html +test_url: http://www.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html +test_url: http://www.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html +test_url: https://www.nytimes.com/interactive/2015/12/16/upshot/100000004092329.app.html?_r=2 diff --git a/data/GrabberConfig/nzz.ch.txt b/data/GrabberConfig/nzz.ch.txt new file mode 100644 index 00000000..749f4f2a --- /dev/null +++ b/data/GrabberConfig/nzz.ch.txt @@ -0,0 +1,12 @@ +body: //*[@class='article-full'] +title: //h3 +strip: //header[@class='group'] +#body: //p[@class='lead'] 
+#move_into(//p[@class='lead']): //*[@class='article-full']/figure +#move_into(//p[@class='lead']): //div[@id='articleBodyText'] +strip: //div[@id='social-media-floater'] +strip: //div[@class='advertisement'] +strip: //div[@class='infobox'] +strip: //div[@id='articleComments'] + +test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213
\ No newline at end of file diff --git a/data/GrabberConfig/o6asan.com.txt b/data/GrabberConfig/o6asan.com.txt new file mode 100644 index 00000000..711a23ee --- /dev/null +++ b/data/GrabberConfig/o6asan.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fo6asan.com%2Fblog-e%2F2016%2F03%2F14%2Fhow-to-install-a-lets-encrypt-certificate-supports-sans-to-apache-on-windows%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: https://o6asan.com/blog-e/2016/03/14/how-to-install-a-lets-encrypt-certificate-supports-sans-to-apache-on-windows/ diff --git a/data/GrabberConfig/observers.france24.com.txt b/data/GrabberConfig/observers.france24.com.txt new file mode 100644 index 00000000..75856c2d --- /dev/null +++ b/data/GrabberConfig/observers.france24.com.txt @@ -0,0 +1,6 @@ +date: //span[@class="date"] +title: //div[contains(concat(' ',normalize-space(@class),' '),' news-meta ')]//h3 +body: //div[contains(concat(' ',normalize-space(@class),' '),' postContent ')] +author: //div[contains(concat(' ',normalize-space(@class),' '),' authors-infos ')]//h4[contains(concat(' ',normalize-space(@class),' '),' media-heading ') and (contains(concat(' ',normalize-space(@class),' '),' userName '))] +strip_id_or_class: observateurBandeau +test_url: http://observers.france24.com/fr/20170407-begpackers-ces-touristes-blancs-mendiant-manche-tour-monde-asie-backpackers diff --git a/data/GrabberConfig/off.net.mk.txt b/data/GrabberConfig/off.net.mk.txt new file mode 100644 index 00000000..bf107876 --- /dev/null +++ b/data/GrabberConfig/off.net.mk.txt @@ -0,0 +1,7 @@ +body: //div[(@id = "content")] +strip: //div[(@class = "links-bar")] +strip: //div[(@class = "povrzani")] +strip: //div[(@class = "povrzani-dolu")] +strip: //div[(@class = "tags")] +strip: //h1[(@id = "page-title")] +test_url: 
http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi
\ No newline at end of file diff --git a/data/GrabberConfig/omgubuntu.co.uk.txt b/data/GrabberConfig/omgubuntu.co.uk.txt new file mode 100644 index 00000000..466ff8f4 --- /dev/null +++ b/data/GrabberConfig/omgubuntu.co.uk.txt @@ -0,0 +1,13 @@ +tidy: no +prune: no + +title: //h2[@class='entry-title'] +author: //a[@rel='author'] + +body: //div[@class='entry-content'] + +strip: //div[@class='entry-share'] +strip: //div[@class='entry-meta'] +strip: //div[@class='sharedaddy sd-sharing-enabled'] +strip: //div[@class='tag-links'] +test_url: http://www.omgubuntu.co.uk/2015/04/facebook-chat-api-empathy-pidgin-stop-working diff --git a/data/GrabberConfig/omiliya.org.txt b/data/GrabberConfig/omiliya.org.txt new file mode 100644 index 00000000..4b3a7202 --- /dev/null +++ b/data/GrabberConfig/omiliya.org.txt @@ -0,0 +1,9 @@ +title: //div[@id='squeeze']/h1 +strip: //div[@id='squeeze']/h1 +author: //div[@class='submitted']/a +strip: //div[@class='submitted']/a +convert_double_br_tags: yes + + + +test_url: http://omiliya.org/content/predchuvstvie.html
\ No newline at end of file diff --git a/data/GrabberConfig/oncletom.io.txt b/data/GrabberConfig/oncletom.io.txt new file mode 100644 index 00000000..b08f8b02 --- /dev/null +++ b/data/GrabberConfig/oncletom.io.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Foncletom.io%2Fnode.js%2Fchapter-01%2Findex.html + +body: //div[@id='content'] +test_url: https://oncletom.io/node.js/chapter-01/index.html diff --git a/data/GrabberConfig/onlinewelten.com.txt b/data/GrabberConfig/onlinewelten.com.txt new file mode 100644 index 00000000..1609fa83 --- /dev/null +++ b/data/GrabberConfig/onlinewelten.com.txt @@ -0,0 +1,2 @@ +body: //div[@id='news_detail']//div[@class='contents clearfix'] +test_url: http://www.onlinewelten.com/games/aliens-colonial-marines/news/offizielle-spiel-ankuendigung-nintendos-wii-u-103690/
\ No newline at end of file diff --git a/data/GrabberConfig/onstartups.com.txt b/data/GrabberConfig/onstartups.com.txt new file mode 100644 index 00000000..cccce8cd --- /dev/null +++ b/data/GrabberConfig/onstartups.com.txt @@ -0,0 +1,2 @@ +strip: //div[@id="dnn_LeftPane"] | //div[@id="dnn_ContentPane"]//h1 | //div[@id="dnn_ContentPane"]//p[@class="Normal"] | //div[@class="Submissions"] | //div[@id="listing"]//h3 | //div[@id="listing"][2] | //div[@id="emart-fail"] | //div[@id="emart-success"] | //div[@id="emart-form"] +test_url: http://onstartups.com/tabid/3339/bid/37737/Secrets-Of-Freemium-Pricing-Make-The-Cheapskates-Pay.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/ontologicalgeek.com.txt b/data/GrabberConfig/ontologicalgeek.com.txt new file mode 100644 index 00000000..a9bf71ef --- /dev/null +++ b/data/GrabberConfig/ontologicalgeek.com.txt @@ -0,0 +1,8 @@ +title: //h1[@class='entry-title'] + +author: //a[@rel='author'] + +date: substring-before(//aside[@class='entry-meta'], '|') + +body: //div[@class='entry-content'] +test_url: http://ontologicalgeek.com/change-or-live-final-fantasy-x-as-catholic-dystopia/
\ No newline at end of file diff --git a/data/GrabberConfig/opensource.org.txt b/data/GrabberConfig/opensource.org.txt new file mode 100644 index 00000000..2bd3ccdb --- /dev/null +++ b/data/GrabberConfig/opensource.org.txt @@ -0,0 +1,2 @@ +body: //div[@class='content clear-block'] +test_url: http://opensource.org/node/537
\ No newline at end of file diff --git a/data/GrabberConfig/openthemagazine.com.txt b/data/GrabberConfig/openthemagazine.com.txt new file mode 100644 index 00000000..6913eb0e --- /dev/null +++ b/data/GrabberConfig/openthemagazine.com.txt @@ -0,0 +1,4 @@ +body: //div[@id = 'content-inner'] +strip: //div[@id = 'content-bottom'] +strip_id_or_class: print_sharebutton +test_url: http://openthemagazine.com/article/nation/sania-vs-saina
\ No newline at end of file diff --git a/data/GrabberConfig/orf.at.txt b/data/GrabberConfig/orf.at.txt new file mode 100644 index 00000000..9eb29c3d --- /dev/null +++ b/data/GrabberConfig/orf.at.txt @@ -0,0 +1,32 @@ +single_page_link: //div[@id='ss-storyText']//p[@class='readMore']/a +single_page_link: //div[@id='ss-storyText']//p[contains(., 'Mehr dazu')]/a + +title: substring-before(//title,' - ') +body: //div[@id="ss-storyText"] +author: substring-before(//div[@id="ss-storyText"]//p[contains(text(), ', ORF.at')], ', ORF.at') +strip: //div[@id="ss-storyText"]//p[contains(text(), ', ORF.at')] +date: substring-after(//div[@class='storyMeta socialshare']//p[@class='date'],'Publiziert am') +strip: //p[@class='date'] + +strip: //p[@class='credit'] +strip: //p[@class='toplink'] +strip: //div[@id="ss-storyText"]/h1 +strip: //div[@class='socialButtons'] +strip: //div[@class='storyMeta socialshare'] +strip: //div[@class='socialShareWrapper'] +strip: //div[@id='socialshareprivacy'] +strip: //div[@class='storyMeta'] +strip: //div[@class='remote'] + +prune: no +tidy: no + +test_url: http://orf.at/stories/2317355/ +test_url: http://orf.at/stories/2084731/ +test_url: http://orf.at/stories/2317313/2317311/ +test_url: http://wien.orf.at/news/stories/2746414/ +test_url: http://ooe.orf.at/news/stories/2750613/ +test_url: http://science.orf.at/stories/2774991/ +test_url: http://orf.at/stories/2339962/ +test_url: http://orf.at/stories/2339958/ +test_url: http://help.orf.at/stories/1770242/ diff --git a/data/GrabberConfig/orientxxi.info.txt b/data/GrabberConfig/orientxxi.info.txt new file mode 100644 index 00000000..85e81976 --- /dev/null +++ b/data/GrabberConfig/orientxxi.info.txt @@ -0,0 +1,12 @@ + +body: //section[@id='chapo'] | //div[@itemprop='articleBody'] + +author: //meta[@property="article:author"]/@content + +strip_id_or_class: campagne_dons +strip_id_or_class: campagne_lettre + +tidy: no +prune: no + +test_url: 
https://orientxxi.info/magazine/hirak-stick-approach-the-only-response-of-moroccan-power,2562 diff --git a/data/GrabberConfig/origo.hu.txt b/data/GrabberConfig/origo.hu.txt new file mode 100644 index 00000000..50717f25 --- /dev/null +++ b/data/GrabberConfig/origo.hu.txt @@ -0,0 +1,18 @@ +title: /html/body/div[5]/div[2]/h1 +body: /html/body/div[5]/div[2]/div[6]/div/div +body: //*[@id="cikk"] +strip: /html/body/div[5]/div[2]/h1 +strip: /html/body/div[5]/div[2]/div[4] +strip: //*[@id="multidoboz"] +strip: /html/body/div[5]/div[2]/div[6]/div[2] +strip: //*[@id="comments"] +strip: //*[@id="rating-doboz"] +strip: /html/body/div[5]/div[2]/div[10] +strip: /html/body/div[5]/div[2]/a +strip: /html/body/div[5]/div[2]/span +strip: /html/body/div[5]/div[2]/span[2] +strip: /html/body/div[5]/div[2]/span[3] +strip: /html/body/div[5]/div[2]/span[4] +strip: /html/body/div[5]/div[2]/span[5] +strip: //*[@id="kommentszam"] +test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html
\ No newline at end of file diff --git a/data/GrabberConfig/oschina.net.txt b/data/GrabberConfig/oschina.net.txt new file mode 100644 index 00000000..56451539 --- /dev/null +++ b/data/GrabberConfig/oschina.net.txt @@ -0,0 +1,3 @@ +title: //h1 +strip_id_or_class: syntaxhighlighter +test_url: http://www.oschina.net/translate/event-based-programming-what-async-has-over-sync?print
\ No newline at end of file diff --git a/data/GrabberConfig/ourworldindata.org.txt b/data/GrabberConfig/ourworldindata.org.txt new file mode 100644 index 00000000..2241e238 --- /dev/null +++ b/data/GrabberConfig/ourworldindata.org.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fourworldindata.org%2Fa-history-of-global-living-conditions-in-5-charts%2F + +body: //article[contains(concat(' ',normalize-space(@class),' '),' page ')] +test_url: https://ourworldindata.org/a-history-of-global-living-conditions-in-5-charts/
\ No newline at end of file diff --git a/data/GrabberConfig/outsideonline.com.txt b/data/GrabberConfig/outsideonline.com.txt new file mode 100644 index 00000000..ee559e4a --- /dev/null +++ b/data/GrabberConfig/outsideonline.com.txt @@ -0,0 +1,6 @@ +body: //div[@class="article__column--right"] +strip_id_or_class: block-recirc +strip_id_or_class: social +strip: //a[@data-pin-custom="true"] + +test_url: http://www.outsideonline.com/2108066/emerald-citys-velo-thieves-have-problem-bike-batman diff --git a/data/GrabberConfig/oxfordamerican.org.txt b/data/GrabberConfig/oxfordamerican.org.txt new file mode 100644 index 00000000..430c9d3f --- /dev/null +++ b/data/GrabberConfig/oxfordamerican.org.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder + +title: //h2[contains(concat(' ',normalize-space(@class),' '),' itemTitle ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' itemFullText ')] +author: //span[contains(concat(' ',normalize-space(@class),' '),' itemAuthor ')] +date: //span[contains(concat(' ',normalize-space(@class),' '),' itemDateCreated ')] + +test_url: http://www.oxfordamerican.org/magazine/item/1066-sweet-bitter-blues diff --git a/data/GrabberConfig/palmbeachpost.com.txt b/data/GrabberConfig/palmbeachpost.com.txt new file mode 100644 index 00000000..58167a47 --- /dev/null +++ b/data/GrabberConfig/palmbeachpost.com.txt @@ -0,0 +1,3 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' story-text ')] + +test_url: http://www.palmbeachpost.com/news/crime--law/new-pbso-releases-sketch-person-interest-shooting/IcToH2Mij0hAw3EIFnw5tL/ diff --git a/data/GrabberConfig/pandodaily.com.txt b/data/GrabberConfig/pandodaily.com.txt new file mode 100644 index 00000000..a5d427af --- /dev/null +++ b/data/GrabberConfig/pandodaily.com.txt @@ -0,0 +1,5 @@ +tidy: no +body: //article +date: //time/@datetime +strip_id_or_class: sharedaddy +test_url: 
http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/
\ No newline at end of file diff --git a/data/GrabberConfig/panic.com.txt b/data/GrabberConfig/panic.com.txt new file mode 100644 index 00000000..e0e2595c --- /dev/null +++ b/data/GrabberConfig/panic.com.txt @@ -0,0 +1,3 @@ +body: //div[@class='entry'] +date: //h3[@class='postDate'] +test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/
\ No newline at end of file diff --git a/data/GrabberConfig/papodehomem.com.br.txt b/data/GrabberConfig/papodehomem.com.br.txt new file mode 100644 index 00000000..2c522da4 --- /dev/null +++ b/data/GrabberConfig/papodehomem.com.br.txt @@ -0,0 +1,6 @@ +title: //h2[@class="page_title"] +body: //div[@class="entry arquivo"] +author: //span[@class="author"] +footnotes: yes +prune: yes +test_url: http://papodehomem.com.br/um-relato-confessional-sobre-a-maioridade-penal/
\ No newline at end of file diff --git a/data/GrabberConfig/paquier.xyz.txt b/data/GrabberConfig/paquier.xyz.txt new file mode 100644 index 00000000..be995584 --- /dev/null +++ b/data/GrabberConfig/paquier.xyz.txt @@ -0,0 +1,3 @@ +body: //div[@class='post'] + +test_url: http://paquier.xyz/postgresql-2/postgres-10-incompatible-changes/ diff --git a/data/GrabberConfig/parislemon.com.txt b/data/GrabberConfig/parislemon.com.txt new file mode 100644 index 00000000..cd9bd55d --- /dev/null +++ b/data/GrabberConfig/parislemon.com.txt @@ -0,0 +1,6 @@ +title: //h2[@class="post-title"] +author: substring-after(//div[@class="description"],'Words by ') +date: //li[@class="date"] +strip: //h2[@class="post-title"] +body: //div[@class="copy"] +test_url: http://parislemon.com/post/13462682469/the-15-inch-air
\ No newline at end of file diff --git a/data/GrabberConfig/parliament.uk.txt b/data/GrabberConfig/parliament.uk.txt new file mode 100644 index 00000000..caaa2e94 --- /dev/null +++ b/data/GrabberConfig/parliament.uk.txt @@ -0,0 +1,3 @@ +title: //h1 +body: //div[@id='news-article'] +test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/
\ No newline at end of file diff --git a/data/GrabberConfig/parool.nl.txt b/data/GrabberConfig/parool.nl.txt new file mode 100644 index 00000000..65fc5a98 --- /dev/null +++ b/data/GrabberConfig/parool.nl.txt @@ -0,0 +1,9 @@ +#bypass cookie check +single_page_link: //a[contains(@href, '/accept?url=')] + +test_url: http://www.parool.nl/parool/nl/4/AMSTERDAM/article/detail/4042734/2015/05/29/MRSA-bacterie-niet-verder-verspreid-in-Bijlmerbajes.dhtml +test_contains: De twee gevangenen die + +http_header(user-agent): Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) + +test_url: http://www.parool.nl/amsterdam/rss.xml diff --git a/data/GrabberConfig/pastebin.com.txt b/data/GrabberConfig/pastebin.com.txt new file mode 100644 index 00000000..03b67b7e --- /dev/null +++ b/data/GrabberConfig/pastebin.com.txt @@ -0,0 +1,6 @@ +title://div[@class="paste_box_line1"]/h1 +author://div[@class="paste_box_line2"]/a +body://div[@class="text"] +date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|') +dissolve://li +test_url: http://pastebin.com/LAykd1es
\ No newline at end of file diff --git a/data/GrabberConfig/pastepad.fivefilters.org.txt b/data/GrabberConfig/pastepad.fivefilters.org.txt new file mode 100644 index 00000000..c535158d --- /dev/null +++ b/data/GrabberConfig/pastepad.fivefilters.org.txt @@ -0,0 +1,5 @@ +title: //h1 +body: //div[@id='ff-pastepad-content'] +prune: no +# todo: add test file +test_url: http://pastepad.fivefilters.org/test.html
\ No newline at end of file diff --git a/data/GrabberConfig/pathawks.com.txt b/data/GrabberConfig/pathawks.com.txt new file mode 100644 index 00000000..25042224 --- /dev/null +++ b/data/GrabberConfig/pathawks.com.txt @@ -0,0 +1,8 @@ +title://*[contains(@class,'post-title')] +body://div[contains(@class,'post-body')] +body://div[contains(@class,'entry-content')] +strip_comments:no +prune:no +convert_double_br_tags:yes +tidy:yes +test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html
\ No newline at end of file diff --git a/data/GrabberConfig/pcmag.com.txt b/data/GrabberConfig/pcmag.com.txt new file mode 100644 index 00000000..96bdd95a --- /dev/null +++ b/data/GrabberConfig/pcmag.com.txt @@ -0,0 +1,10 @@ +prune:yes + +date://*[contains(@class,'date')] + +body://div[contains(@id,'content')] + +next_page_link://a[contains(.,'Next >')] + +strip_id_or_class:sponsors +test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp
\ No newline at end of file diff --git a/data/GrabberConfig/pcworld.com.txt b/data/GrabberConfig/pcworld.com.txt new file mode 100644 index 00000000..7193f87e --- /dev/null +++ b/data/GrabberConfig/pcworld.com.txt @@ -0,0 +1,19 @@ +title: //div[@class='articleHead']//h1 +author: //div[@class="author-name"]/a[1] +body: //div[@class="main"] + +# remove 'From the Lab' and 'Recent posts' text +strip: //div[@class='blogLabel'] + +# remove byline and meta info +strip: //h1 +strip: //div[@class="article-meta"] +strip: //div[@class="author-info"] + +#strip tags and categories +strip: //div[@class="department"] + +#strip product cap links +strip: //div[@class="cap-main"] +strip: //div[@id="compare-lede"] +test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html
\ No newline at end of file diff --git a/data/GrabberConfig/penny-arcade.com.txt b/data/GrabberConfig/penny-arcade.com.txt new file mode 100644 index 00000000..a0d5099e --- /dev/null +++ b/data/GrabberConfig/penny-arcade.com.txt @@ -0,0 +1,23 @@ +# 2012-01-14 carlo@... - fixed title, body; added author, date + +title: //div[@class="title"]/h2/a +# body: //div[@class="post"] +# author: //p[@class="iconEmail"]/a +# date: //p[@class="iconDate"] + +# 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report + +# Penny Arcade + +author: //li[@class="iconEmail"]/a +date: //li[@class="iconDate"] +body: //div[@class="body"] + +# PA Report + +author: //div[@class="meta"]/p/a +date: substring-after(//div[@class="meta"]/p, '/ ') +title: substring-after(//title, '- ') + +test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news +test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech
\ No newline at end of file diff --git a/data/GrabberConfig/pentaxforums.com.txt b/data/GrabberConfig/pentaxforums.com.txt new file mode 100644 index 00000000..00f61a48 --- /dev/null +++ b/data/GrabberConfig/pentaxforums.com.txt @@ -0,0 +1,2 @@ +next_page_link: //a[contains(., 'Next:')] +test_url: http://www.pentaxforums.com/reviews/long-exposure-handhelds/introduction.html
\ No newline at end of file diff --git a/data/GrabberConfig/phastidio.net.txt b/data/GrabberConfig/phastidio.net.txt new file mode 100644 index 00000000..ae06172d --- /dev/null +++ b/data/GrabberConfig/phastidio.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fphastidio.net%2F2015%2F11%2F10%2Fa-sportellate-sul-renzismo%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://phastidio.net/2015/11/10/a-sportellate-sul-renzismo/ diff --git a/data/GrabberConfig/philadelphiaeagles.com.txt b/data/GrabberConfig/philadelphiaeagles.com.txt new file mode 100644 index 00000000..5ba5f772 --- /dev/null +++ b/data/GrabberConfig/philadelphiaeagles.com.txt @@ -0,0 +1,6 @@ +prune: no +tidy: no +body: //div[@class='article-content'] +dissolve: //nobr/a +dissolve: //nobr +test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7
\ No newline at end of file diff --git a/data/GrabberConfig/philstar.com.txt b/data/GrabberConfig/philstar.com.txt new file mode 100644 index 00000000..60136f3a --- /dev/null +++ b/data/GrabberConfig/philstar.com.txt @@ -0,0 +1,4 @@ +body: //div[@property="content:encoded"] + +test_url: http://www.philstar.com/headlines/2017/03/06/1678561/samuel-martires-named-new-sc-justice +test_url: http://www.philstar.com/rss/breakingnews diff --git a/data/GrabberConfig/phoronix.com.txt b/data/GrabberConfig/phoronix.com.txt new file mode 100644 index 00000000..c7421115 --- /dev/null +++ b/data/GrabberConfig/phoronix.com.txt @@ -0,0 +1,8 @@ +# based on the grabber rules of picofeed + +body: //div[@class="content"] +test_url: http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1 + +# replace_string(<h5>): <h2> + +next_page_link: //a[@title='Go To Next Page'] diff --git a/data/GrabberConfig/photo.tutsplus.com.txt b/data/GrabberConfig/photo.tutsplus.com.txt new file mode 100644 index 00000000..7f7e3830 --- /dev/null +++ b/data/GrabberConfig/photo.tutsplus.com.txt @@ -0,0 +1,6 @@ +author: substring-before(//div[@class='post_meta'],' on') +date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on') +title: //h1[@class='post_title'] +body: //div[@class='article'] + +test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/
\ No newline at end of file diff --git a/data/GrabberConfig/phototrend.fr.txt b/data/GrabberConfig/phototrend.fr.txt new file mode 100644 index 00000000..55731763 --- /dev/null +++ b/data/GrabberConfig/phototrend.fr.txt @@ -0,0 +1,4 @@ +strip_id_or_class: post-tags +strip_id_or_class: post-header + +test_url: https://phototrend.fr/2016/10/symetrie-instagram-symmetrical-monsters/ diff --git a/data/GrabberConfig/php.net.txt b/data/GrabberConfig/php.net.txt new file mode 100644 index 00000000..cc643f05 --- /dev/null +++ b/data/GrabberConfig/php.net.txt @@ -0,0 +1,6 @@ +body: //div[@id='content'] +strip_id_or_class: manualnavbar + +prune: no + +test_url: http://www.php.net/manual/en/migration5.incompatible.php
\ No newline at end of file diff --git a/data/GrabberConfig/pieria.co.uk.txt b/data/GrabberConfig/pieria.co.uk.txt new file mode 100644 index 00000000..31f2c84a --- /dev/null +++ b/data/GrabberConfig/pieria.co.uk.txt @@ -0,0 +1,3 @@ +body: //div[@id='article'] + +test_url: http://www.pieria.co.uk/articles/need_a_fiscal_rule_george__try_get_the_economy_growing diff --git a/data/GrabberConfig/pinterest.com.txt b/data/GrabberConfig/pinterest.com.txt new file mode 100644 index 00000000..b35c00f6 --- /dev/null +++ b/data/GrabberConfig/pinterest.com.txt @@ -0,0 +1,5 @@ +title: //title +body: //div[contains(@class, 'imageContainer')] + +test_url: http://pinterest.com/pin/380906080954441188/ +test_url: http://pinterest.com/michaelsorm/architecture.rss diff --git a/data/GrabberConfig/pitchfork.com.txt b/data/GrabberConfig/pitchfork.com.txt new file mode 100644 index 00000000..eee96a9c --- /dev/null +++ b/data/GrabberConfig/pitchfork.com.txt @@ -0,0 +1,16 @@ +title:concat(//h1,' - ',//h2,' - ',//h3) +author://address +date://span[@class='pub-date'] +body://div[@id='main'] +single_page_link://link[@rel='canonical'] +strip://div[@class='info'] +strip_id_or_class:'object-grid related-content' +strip_id_or_class:'object-prevnext' +strip_id_or_class:'object-header' +strip_id_or_class:'source' +strip_id_or_class:'label' +strip_id_or_class:'title' +dissolve://ul +strip://li[@class='next'] +strip://li[@class='prev'] +test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/
\ No newline at end of file diff --git a/data/GrabberConfig/pittsburghmagazine.com.txt b/data/GrabberConfig/pittsburghmagazine.com.txt new file mode 100644 index 00000000..4d02f6bb --- /dev/null +++ b/data/GrabberConfig/pittsburghmagazine.com.txt @@ -0,0 +1,8 @@ +title: //title +author: substring-after(//div[@class='by-line'],'BY') + +body: //div[@id='article-body'] + +strip: //div[@class='by-line'] +strip: //div[@id='article-body']/h1 +test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/
\ No newline at end of file diff --git a/data/GrabberConfig/pittsburghpanthers.com.txt b/data/GrabberConfig/pittsburghpanthers.com.txt new file mode 100644 index 00000000..c372284a --- /dev/null +++ b/data/GrabberConfig/pittsburghpanthers.com.txt @@ -0,0 +1,4 @@ +title: //span[@class='StoryHeadline'] +strip: //div[@class='fivevert'] +body: //div[@id='Content'] +test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html
\ No newline at end of file diff --git a/data/GrabberConfig/pittscriptblog.com.txt b/data/GrabberConfig/pittscriptblog.com.txt new file mode 100644 index 00000000..571874a4 --- /dev/null +++ b/data/GrabberConfig/pittscriptblog.com.txt @@ -0,0 +1,8 @@ +title: //h1[@class='articletitle'] +author: substring-after(//span[@class='author'],'by') +date: //span[@class='created'] +body: //div[@class='article'] +strip: //div[@class='headline'] +strip: //p[@class='articleinfo'] +#dissolve: //p[@class='subheader'] +test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html
\ No newline at end of file diff --git a/data/GrabberConfig/pixellibre.net.txt b/data/GrabberConfig/pixellibre.net.txt new file mode 100644 index 00000000..cd501b11 --- /dev/null +++ b/data/GrabberConfig/pixellibre.net.txt @@ -0,0 +1,4 @@ +title: //h1[@class="entry-title"] +author: //a[@class='url fn n'] + +test_url: https://pixellibre.net/2017/10/vie-privee-smartphones-applications/ diff --git a/data/GrabberConfig/pjmedia.com.txt b/data/GrabberConfig/pjmedia.com.txt new file mode 100644 index 00000000..79f8c166 --- /dev/null +++ b/data/GrabberConfig/pjmedia.com.txt @@ -0,0 +1,6 @@ +find_string:display:none +replace_string: .. + +single_page_link: //div[@class='single-page-button']//a + +test_url: https://pjmedia.com/eddriscoll/2016/08/31/tom-wolfe-kingdom-of-speech/ diff --git a/data/GrabberConfig/placegrenet.fr.txt b/data/GrabberConfig/placegrenet.fr.txt new file mode 100644 index 00000000..bd9d36ea --- /dev/null +++ b/data/GrabberConfig/placegrenet.fr.txt @@ -0,0 +1,23 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' entry-title ')] + +author: //span[@class='auteurcat'] + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +find_string: <strong>FOCUS – +replace_string: <strong> + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' ms-auth-header ')]/ancestor::fieldset + +# Wallabag-specific login directives (not supported in FTR): +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' ms-auth-header ')] +login_uri: https://www.placegrenet.fr/wp-login.php +login_username_field: log +login_password_field: pwd +login_extra_fields: rememberme=forever +login_extra_fields: wp-submit=Se connecter +login_extra_fields: testcookie=1 + +test_url: https://www.placegrenet.fr/2018/07/27/condition-animale-pressee-par-des-associations-la-ville-de-grenoble-vote-un-voeu-pour-interpeller-letat/201804 + diff --git a/data/GrabberConfig/planet3dnow.de.txt 
b/data/GrabberConfig/planet3dnow.de.txt new file mode 100644 index 00000000..23db6b00 --- /dev/null +++ b/data/GrabberConfig/planet3dnow.de.txt @@ -0,0 +1,16 @@ +# Author: Jan Lukas Gernert + +tidy: no +prune: no + +title: //title +author: //span[@class='author vcard']/a + +body: //article +next_page_link: //span[@class='multipage-navlink nav-next']/a + +strip: //div[@id='pagination_top'] +strip: //header[@class='entry-header'] +strip: //span[@class='multipage-navlink nav-previous'] + +test_url: http://www.planet3dnow.de/cms/14809-test-fractal-design-node-804/ diff --git a/data/GrabberConfig/planetvita.de.txt b/data/GrabberConfig/planetvita.de.txt new file mode 100644 index 00000000..bfc3342d --- /dev/null +++ b/data/GrabberConfig/planetvita.de.txt @@ -0,0 +1,5 @@ +title: //div[@id='frnRahmen']/div/div[@id='content']/div[2]/h2 +author: //div[@id='content']/div[1]/div/a +body: //div[@id='content']/div[2]/span +strip: //div[@id='commenthead'] +test_url: http://www.planetvita.de/news/10389-psn-store-update-vom-03-april-neue-inhalte-fuer-psvita.html
\ No newline at end of file diff --git a/data/GrabberConfig/playboy.com.txt b/data/GrabberConfig/playboy.com.txt new file mode 100644 index 00000000..3031bf03 --- /dev/null +++ b/data/GrabberConfig/playboy.com.txt @@ -0,0 +1,8 @@ +author: //article//*[@class="author"] +date: //article//*[@class="publication-date"] +body: //article +strip: //article/header +strip: //article/section +strip_id_or_class: cta-image + +test_url: https://www.playboy.com/read/forget-camming-adult-entertainment-is-taking-over-snapchat diff --git a/data/GrabberConfig/playgroupnsw.org.au.txt b/data/GrabberConfig/playgroupnsw.org.au.txt new file mode 100644 index 00000000..d8ace672 --- /dev/null +++ b/data/GrabberConfig/playgroupnsw.org.au.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.playgroupnsw.org.au%2FParentResources%2FParentingYoungChildren%2Fcalm-ways-discipline-child + +body: //div[contains(concat(' ',normalize-space(@class),' '),' collection-intro-main ')] +test_url: http://www.playgroupnsw.org.au/ParentResources/ParentingYoungChildren/calm-ways-discipline-child
\ No newline at end of file diff --git a/data/GrabberConfig/ploum.net.txt b/data/GrabberConfig/ploum.net.txt new file mode 100644 index 00000000..bbd454ae --- /dev/null +++ b/data/GrabberConfig/ploum.net.txt @@ -0,0 +1,8 @@ +body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')] + +strip_id_or_class: readoffline-embed +strip_id_or_class: fasteasysocialsharing +strip_id_or_class: entry-tags +strip_id_or_class: entry-author-bottom + +test_url: https://ploum.net/sunrise-le-calendrier-du-futur/ diff --git a/data/GrabberConfig/plus.google.com.txt b/data/GrabberConfig/plus.google.com.txt new file mode 100644 index 00000000..ea36f5b7 --- /dev/null +++ b/data/GrabberConfig/plus.google.com.txt @@ -0,0 +1,14 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' Ig ')]//div//div[contains(concat(' ',normalize-space(@class),' '),' Ct ')] + +author: //div[@id='contentPane']//h3//a[@rel="author"]/@content + +strip: //*[@title="People who +1'd this"]/../.. +strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')] +strip: //*[@role='menu'] +strip: //img[contains(@alt, 'profile photo')] +strip: //*[@class='a-f-i-Ad'] + +tidy: no + +test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp +test_url: https://plus.google.com/+googleplus/posts/d1XubVAZ5hV diff --git a/data/GrabberConfig/plzkthxbai.com.txt b/data/GrabberConfig/plzkthxbai.com.txt new file mode 100644 index 00000000..ec151b42 --- /dev/null +++ b/data/GrabberConfig/plzkthxbai.com.txt @@ -0,0 +1,4 @@ +title: //h2[@class='jcw-pagetitle'] +date: //p[@class='postinfo'] +body: //div[@class='contenttext'] +test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/
\ No newline at end of file diff --git a/data/GrabberConfig/pmf.silvrback.com.txt b/data/GrabberConfig/pmf.silvrback.com.txt new file mode 100644 index 00000000..b15daa44 --- /dev/null +++ b/data/GrabberConfig/pmf.silvrback.com.txt @@ -0,0 +1,2 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' all_external_links ')] +test_url: https://pmf.silvrback.com/fixing-tethering-on-android-kitkat
\ No newline at end of file diff --git a/data/GrabberConfig/pogue.blogs.nytimes.com.txt b/data/GrabberConfig/pogue.blogs.nytimes.com.txt new file mode 100644 index 00000000..65ddba54 --- /dev/null +++ b/data/GrabberConfig/pogue.blogs.nytimes.com.txt @@ -0,0 +1,4 @@ +body: //div[@id="content"]/div[1] + +title: //h1[@class="entry-title"] +test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/
\ No newline at end of file diff --git a/data/GrabberConfig/politico.com.txt b/data/GrabberConfig/politico.com.txt new file mode 100644 index 00000000..d8f5e575 --- /dev/null +++ b/data/GrabberConfig/politico.com.txt @@ -0,0 +1,13 @@ +title://div[contains(@class, "article")]/h1 +body://div[contains(@class,"story-text")] + +# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] + +next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a +date://meta[@name="publish_date"]/@content + +strip://div[contains(@class, "breadcrumbs")] +strip://a[contains(@class, "hidden")] +strip://div[contains(@class, "story-embed")] +strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. +test_url: http://www.politico.com/news/stories/0712/78105.html
\ No newline at end of file diff --git a/data/GrabberConfig/politifact.com.txt b/data/GrabberConfig/politifact.com.txt new file mode 100644 index 00000000..65a8fc57 --- /dev/null +++ b/data/GrabberConfig/politifact.com.txt @@ -0,0 +1,4 @@ +body: //div[@id="content"] + +strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"] +test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/
\ No newline at end of file diff --git a/data/GrabberConfig/politiken.dk.txt b/data/GrabberConfig/politiken.dk.txt new file mode 100644 index 00000000..b13f8f87 --- /dev/null +++ b/data/GrabberConfig/politiken.dk.txt @@ -0,0 +1,13 @@ +# 21/10-2011: +# Added Author+Date +# Remove fakta-boks if found +# Deleted 'Læs også...' filter +# - Change in markup caused it to strip too much. + +author://span[@class='autor-name'] +date:substring-after(//div[@class='art-created'], ' ') +title: //h1[contains(@class, 'stor-type')] +body: //div[@id='art-body'] +strip: //div[@class='art-fakta article-box'] + +test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/
\ No newline at end of file diff --git a/data/GrabberConfig/politis.fr.txt b/data/GrabberConfig/politis.fr.txt new file mode 100644 index 00000000..1a389d45 --- /dev/null +++ b/data/GrabberConfig/politis.fr.txt @@ -0,0 +1,13 @@ +author: //div[contains(concat(' ',normalize-space(@class),' '),' article-author-sign ')]//a[@itemprop='author'] +body://div[contains(concat(' ',normalize-space(@class),' '),' article-text ')] + +test_url: https://www.politis.fr/articles/2018/07/john-coltrane-linfinie-creation-39113/ + +# Wallabag-specific login directives (not supported in FTR) +requires_login: yes +login_uri: https://www.politis.fr/compte/login/?next=referrer +login_username_field: login +login_password_field: password +not_logged_in_xpath: //h6[contains(concat(' ',normalize-space(@class),' '),' register-headline ')] + + diff --git a/data/GrabberConfig/polygon.com.txt b/data/GrabberConfig/polygon.com.txt new file mode 100644 index 00000000..8fe9b1be --- /dev/null +++ b/data/GrabberConfig/polygon.com.txt @@ -0,0 +1,34 @@ +body: //div[@id='article-content'] +body: //article[@id='entry-top']/div[@class='float_wrapper'] +author: //header/p[@class='byline']/em/a +date: //header/p[@class='byline']/span[@class='timestamp'] + +strip: //div[@id='article-content']//header +strip: //label + +#photos on left column (delete all) +strip: //div[@class='big_photo'] + +#photos on left column (remove extras used for scroll effect) +#strip: //div[@class='big_photo']/div[./img] +#strip: //div[@class='big_photo']/img[position()>1] + +strip_id_or_class: vox-lazy-load +strip_id_or_class: social_buttons +strip_id_or_class: feature_toc + +prune: no + +find_string: <noscript> +replace_string: <div> +find_string: </noscript> +replace_string: </div> + +#find_string: <script +#replace_string: <div style="display:none" +#find_string: </script> +#replace_string: </div> + +strip: //div[@class='float_wrapper']/header +test_url: 
http://www.polygon.com/2013/4/5/4189028/donkey-kong-country-returns-3d-new-content +test_url: http://www.polygon.com/features/2013/8/22/4602568/30-years-xbox-360-playstation-3-wii
\ No newline at end of file diff --git a/data/GrabberConfig/popehat.com.txt b/data/GrabberConfig/popehat.com.txt new file mode 100644 index 00000000..45b76ae1 --- /dev/null +++ b/data/GrabberConfig/popehat.com.txt @@ -0,0 +1,6 @@ +title: //div[@class='entry-title'] +author: //div[@class='entry-author'] +date: //div[@class='entry-time'] +body: //div[@class='entry-content'] + +test_url: https://popehat.com/2015/12/16/eric-posner-the-first-amendments-nemesis/ diff --git a/data/GrabberConfig/popsci.com.txt b/data/GrabberConfig/popsci.com.txt new file mode 100644 index 00000000..2a928e26 --- /dev/null +++ b/data/GrabberConfig/popsci.com.txt @@ -0,0 +1,11 @@ +body: //div[contains(@class, 'field-body')] +strip_image_src: placeholder +find_string: <noscript> +replace_string: <div> +find_string: </noscript> +replace_string: </div> + +prune: no + +test_url: http://www.popsci.com/nasa-crashed-another-plane-today +test_url: http://www.popsci.com/rss.xml
\ No newline at end of file diff --git a/data/GrabberConfig/popularmechanics.com.txt b/data/GrabberConfig/popularmechanics.com.txt new file mode 100644 index 00000000..2582e6fb --- /dev/null +++ b/data/GrabberConfig/popularmechanics.com.txt @@ -0,0 +1,8 @@ +next_page_link: //div[@id='longPagination']/a[@class='next'] + +title: //div[@id='contentHeader']//h1 + +body: //div[@id='articleBody'] +# this is so sad +body: //div[@id='intelliTXT'] +test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877
\ No newline at end of file diff --git a/data/GrabberConfig/portertech.ca.txt b/data/GrabberConfig/portertech.ca.txt new file mode 100644 index 00000000..2897cb57 --- /dev/null +++ b/data/GrabberConfig/portertech.ca.txt @@ -0,0 +1,3 @@ +author: //*[(@class = "author")] +date: //*[(@class = "date")] +test_url: http://portertech.ca/2012/12/10/iac-morning-market/
\ No newline at end of file diff --git a/data/GrabberConfig/positioningmag.com.txt b/data/GrabberConfig/positioningmag.com.txt new file mode 100644 index 00000000..f8eeb0a3 --- /dev/null +++ b/data/GrabberConfig/positioningmag.com.txt @@ -0,0 +1,19 @@ +title: //div[@id="newsDetailTitle"] +author: //span[@id="showAuthor"] +date: //span[@id="showRefDate"] + +strip: //div[@id="breadcrumbs"] +strip: //span[@id="PageTitle"] +strip: //div[@id="newsDetailAuthorPublish"] + +strip: //div[@class="leadPix"] + +strip: //span[@id="ctl00_PageTitle"] +strip: //div[@id="newsDetailTitle"] +convert_double_br_tags:yes + +strip: //div[@id="newsDetailCredential"] +strip: //div[@id="sidebar2"] +strip: //div[@id="footer"] + +test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083
\ No newline at end of file diff --git a/data/GrabberConfig/posta.com.tr.txt b/data/GrabberConfig/posta.com.tr.txt new file mode 100644 index 00000000..0f01149c --- /dev/null +++ b/data/GrabberConfig/posta.com.tr.txt @@ -0,0 +1,15 @@ +title: //div[@id='divAdnetKeyword']/h1 +body: //div[@id='_middle_content_bottom'] + +wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img + +strip: //div[@id='_middle_content_bottom_child1'] +strip: //div[@id='_middle_content_bottom_child4'] +strip: //div[@class='cls'] +strip: //div[@class='iphoneBox'] +strip: //ul[@class='ilgiliHaber'] +strip: //div[@class='yorumlar'] +strip: //div[@class='kategoriler'] +strip: //div[@class='textSize'] +strip: //span[@class='tarih'] +test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044
\ No newline at end of file diff --git a/data/GrabberConfig/presse-citron.net.txt b/data/GrabberConfig/presse-citron.net.txt new file mode 100644 index 00000000..6262ab8c --- /dev/null +++ b/data/GrabberConfig/presse-citron.net.txt @@ -0,0 +1,9 @@ +title: //header[@class='post-header cf']/h1[@class='post-title'] +author: //div[@class="post-meta"]/span[@class='posted-by']/span[@class='reviewer']/a +date: //div[@class="post-meta"]/span[@class='posted-on']/span[@class='dtreviewed']/time/@datetime +body: //div[@class="post-content description"] + +strip: //div[@id="essb_links essb_counters essb_displayed_bottom essb_share essb_template_fancy-retina essb_1179534949 essb_links_right print-no"] +strip: //div[@class="tagcloud"] + +test_url: http://www.presse-citron.net/voici-comment-vous-pourrez-customiser-vos-fils-dactualite-facebook/ diff --git a/data/GrabberConfig/presseportal.de.txt b/data/GrabberConfig/presseportal.de.txt new file mode 100644 index 00000000..703806d8 --- /dev/null +++ b/data/GrabberConfig/presseportal.de.txt @@ -0,0 +1,11 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' story-text ')] + +strip_id_or_class: news-bodycopy + +parser: html5php +tidy: no + +test_url: http://www.presseportal.de/pm/103258/2930232/felix-neureuther-vor-der-ski-wm-ich-denke-von-rennen-zu-rennen +test_url: http://www.presseportal.de/pm/66749/2933779/koelner-stadt-anzeiger-bahnmitarbeiter-werden-in-nrw-immer-haeufiger-angegriffen-zahl-der/rss +test_contains: kleineren Bahnhöfen installieren und erhofft +test_url: http://www.presseportal.de/rss/presseportal.rss2 diff --git a/data/GrabberConfig/privacyinternational.org.txt b/data/GrabberConfig/privacyinternational.org.txt new file mode 100644 index 00000000..d722ff0f --- /dev/null +++ b/data/GrabberConfig/privacyinternational.org.txt @@ -0,0 +1,6 @@ +title: //h1[contains(@class, 'page-title')] + +body: //div[@id='block-privacy-international-content']/div[contains(@class, 'group-left')] +prune: no + +test_url: 
https://privacyinternational.org/feature/2433/i-asked-online-tracking-company-all-my-data-and-heres-what-i-found diff --git a/data/GrabberConfig/pro-linux.de.txt b/data/GrabberConfig/pro-linux.de.txt new file mode 100644 index 00000000..57bf7010 --- /dev/null +++ b/data/GrabberConfig/pro-linux.de.txt @@ -0,0 +1,19 @@ +tidy: no +prune: no + +title: //h2[@class='title'] +date: //div[@class='tidescr'] + +body: //div[@id='article'] +body: //div[@id='news'] + +next_page_link: //a[@title='nächste'] + +strip: //div[@class='topic'] +strip: //h2[@class='title'] +strip: //div[a[contains(@href,'/user/')]] +strip: //div[@class='picto'] +strip: //p[@class='addinfo'] +strip: //h3[@class='topic'] + +test_url: http://www.pro-linux.de/artikel/2/1762/ubuntu-und-kubuntu-1504.html diff --git a/data/GrabberConfig/prog21.dadgum.com.txt b/data/GrabberConfig/prog21.dadgum.com.txt new file mode 100644 index 00000000..9a49557e --- /dev/null +++ b/data/GrabberConfig/prog21.dadgum.com.txt @@ -0,0 +1,9 @@ +title: //h1 +body: //div[@id='left'] +strip: //h1 +convert_double_br_tags: yes +strip_id_or_class: entry-footer +strip: //h1[. = 'Previously']/following::* +author: string('James Hague') +date: //div[@class = 'entry-footer']/text() +test_url: http://prog21.dadgum.com/105.html
\ No newline at end of file diff --git a/data/GrabberConfig/prolost.com.txt b/data/GrabberConfig/prolost.com.txt new file mode 100644 index 00000000..82ebf6bb --- /dev/null +++ b/data/GrabberConfig/prolost.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='body'] +title: //h2[@class='title'] +date: //span[@class='posted-on'] +test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html
\ No newline at end of file diff --git a/data/GrabberConfig/propakistani.pk.txt b/data/GrabberConfig/propakistani.pk.txt new file mode 100644 index 00000000..ea316e47 --- /dev/null +++ b/data/GrabberConfig/propakistani.pk.txt @@ -0,0 +1,3 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' content-post ')] +test_url: https://propakistani.pk/2017/06/23/eid-mubarak-happy-holidays-everyone/ +test_contains: away from work for Eid Holidays diff --git a/data/GrabberConfig/propublica.org.txt b/data/GrabberConfig/propublica.org.txt new file mode 100644 index 00000000..d141ac90 --- /dev/null +++ b/data/GrabberConfig/propublica.org.txt @@ -0,0 +1,11 @@ +title: //h1[@class="article-title"] +author: //meta[@name="author"]/@content +body: //div[@class="article-full"] +strip_id_or_class: sidebar_inject +strip_id_or_class: callout +strip_id_or_class: content-inset +strip_id_or_class: byline-block +strip_id_or_class: photo-caption +strip_id_or_class: foot-tools + +test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places
\ No newline at end of file diff --git a/data/GrabberConfig/prospectmagazine.co.uk.txt b/data/GrabberConfig/prospectmagazine.co.uk.txt new file mode 100644 index 00000000..739d1b9e --- /dev/null +++ b/data/GrabberConfig/prospectmagazine.co.uk.txt @@ -0,0 +1,26 @@ +#basics +author: (//div[contains(@class,'author')])[1] +date: substring-before(//a[@class='issue'], '—') +#body://div[@class = 'entry'] +# use this until move_into support is ready +body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image'] + +#moves header image and tagline into body +move_into(//div[@class='entry']/div)://div[@class = 'lead_image'] +move_into(//div[@class='entry']/div)://div[@class = 'standfirst'] + + +# moves author info to end of text +move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em + +prune: no + +# strips social links +strip_id_or_class:login-status +strip_id_or_class:shareinpost +strip_id_or_class:content_subscribe +strip_id_or_class:postinfo +strip_id_or_class:postutils +strip_id_or_class:comments +strip://strong[string(.) = 'Follow Prospect on Twitter'] +test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/
\ No newline at end of file diff --git a/data/GrabberConfig/protothema.gr.txt b/data/GrabberConfig/protothema.gr.txt new file mode 100644 index 00000000..fae261b0 --- /dev/null +++ b/data/GrabberConfig/protothema.gr.txt @@ -0,0 +1,6 @@ +body: //a[contains(@rel, 'mainphotos')] | //div[contains(@class, 'article-content')] + +prune: no + +test_url: http://www.protothema.gr//politics/article/326464/diamadopoulou-floridis-kaminis-kai-boutaris-se-ekdilosi-ton-europaion-fileleutheron/ +test_url: http://www.protothema.gr/rss/news/politics/
\ No newline at end of file diff --git a/data/GrabberConfig/psychologytoday.com.txt b/data/GrabberConfig/psychologytoday.com.txt new file mode 100644 index 00000000..1bb63c29 --- /dev/null +++ b/data/GrabberConfig/psychologytoday.com.txt @@ -0,0 +1,9 @@ +title: //div[@class="page-title"]/h1 +author: //a[@title="View Bio"] +date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by') +strip://div[@class="page-title"]/h1 +strip://div[@class="article-abstract"] +strip://div[@class="article-meta"] +strip://div[@id="rightColumn"] +strip://div[@id="inline-content-bottom-left"] +test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook
\ No newline at end of file diff --git a/data/GrabberConfig/publications.parliament.uk.txt b/data/GrabberConfig/publications.parliament.uk.txt new file mode 100644 index 00000000..8f32d7a4 --- /dev/null +++ b/data/GrabberConfig/publications.parliament.uk.txt @@ -0,0 +1,4 @@ +author: //meta[@name="Author"] +date: //meta[@name="Date"] +strip: //h5 +test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm
\ No newline at end of file diff --git a/data/GrabberConfig/publico.pt.txt b/data/GrabberConfig/publico.pt.txt new file mode 100644 index 00000000..bb6a05e1 --- /dev/null +++ b/data/GrabberConfig/publico.pt.txt @@ -0,0 +1,12 @@ +title: //h1[@class="entry-title"] +author: //span[@class="author"] +body: //article[@itemtype="http://schema.org/Article"] +date: //time[@itemprop="dateCreated"] + +strip: //header[@class="entry-header single-header"] +strip: //aside[@class="entry-assets"] +strip: //div[@class="entry-options entry-options-above group"] +strip: //div[@class="entry-options entry-options-below group"] + +convert_double_br_tags: yes +test_url: http://www.publico.pt/politica/noticia/passos-diz-que-se-limitacao-de-mandatos-fosse-para-todos-os-concelhos-estaria-claro-na-lei-1577691
\ No newline at end of file diff --git a/data/GrabberConfig/puri.sm.txt b/data/GrabberConfig/puri.sm.txt new file mode 100644 index 00000000..0e3ff514 --- /dev/null +++ b/data/GrabberConfig/puri.sm.txt @@ -0,0 +1,7 @@ +title: //h1[@class="entry-title"] +author: //h3[@class="fn name"] +date: //time[contains(@class, "published")]/@datetime +body: //div[@class="entry-content"] +strip: //div[@class="abh_box abh_box_up abh_box_business"] + +test_url: https://puri.sm/posts/introducing-the-librem-key/ diff --git a/data/GrabberConfig/putaindecode.io.txt b/data/GrabberConfig/putaindecode.io.txt new file mode 100644 index 00000000..45825935 --- /dev/null +++ b/data/GrabberConfig/putaindecode.io.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fputaindecode.io%2Ffr%2Farticles%2Ffavicon%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' putainde-Post-md ')]//div +test_url: http://putaindecode.io/fr/articles/favicon/ diff --git a/data/GrabberConfig/putsch.media.txt b/data/GrabberConfig/putsch.media.txt new file mode 100644 index 00000000..531d379e --- /dev/null +++ b/data/GrabberConfig/putsch.media.txt @@ -0,0 +1,27 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' full-width-media ')] + +date: //time/@datetime + +prune: no + +strip: //time +strip: //script +strip_id_or_class: author +strip_id_or_class: private +strip_id_or_class: entry-title +strip_id_or_class: social-fixed + +test_url: https://putsch.media/20180628/culture/art-et-expos/la-nuit-des-eglises-a-la-decouverte-dun-patrimoine-francais/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +# login required if both the login form and the private div are present: 
+not_logged_in_xpath: //form[@id='login_form'] +login_uri: https://putsch.media/wp-ajax.php +login_username_field: user +login_password_field: pass +login_extra_fields: action=login + diff --git a/data/GrabberConfig/pymotw.com.txt b/data/GrabberConfig/pymotw.com.txt new file mode 100644 index 00000000..0f9db08e --- /dev/null +++ b/data/GrabberConfig/pymotw.com.txt @@ -0,0 +1,5 @@ +body: //div[starts-with(@id, 'module-')] + +test_url: https://pymotw.com/3/configparser/ +test_url: https://pymotw.com/3/shlex/ +test_url: https://pymotw.com/3/sys/ diff --git a/data/GrabberConfig/qctimes.com.txt b/data/GrabberConfig/qctimes.com.txt new file mode 100644 index 00000000..3c3edfeb --- /dev/null +++ b/data/GrabberConfig/qctimes.com.txt @@ -0,0 +1,5 @@ +# this site seems to work OK in the web view, but only occasionally in the instapaper app itself. + +body: //div[@class='entry-content'] +author: //span[@class='byline'] +test_url: http://qctimes.com/news/local/woman-faces-perjury-charges-in-meth-case/article_83f4c470-956a-11e2-a921-001a4bcf887a.html
\ No newline at end of file diff --git a/data/GrabberConfig/quantamagazine.org.txt b/data/GrabberConfig/quantamagazine.org.txt new file mode 100644 index 00000000..9bd3901f --- /dev/null +++ b/data/GrabberConfig/quantamagazine.org.txt @@ -0,0 +1,4 @@ +body: //div[contains(@class, 'post__content__section')] + +test_url: https://www.quantamagazine.org/a-path-less-taken-to-the-peak-of-the-math-world-20170627/ +test_contains: Mathematicians are interested in the following diff --git a/data/GrabberConfig/quantumdiaries.org.txt b/data/GrabberConfig/quantumdiaries.org.txt new file mode 100644 index 00000000..c17fb312 --- /dev/null +++ b/data/GrabberConfig/quantumdiaries.org.txt @@ -0,0 +1,14 @@ +title: //div[contains(@class, "hentry")]/h3 + +author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")] + +date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under") + +body: //div[contains(@class, "entry")] + +strip_id_or_class: addtoany_share_save_container +strip_id_or_class: postmetadata +strip_id_or_class: author_bio +strip_id_or_class: author_bio_2 +strip: //div[contains(@class, "hentry")]/h3 +test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/
\ No newline at end of file diff --git a/data/GrabberConfig/quechoisir.org.txt b/data/GrabberConfig/quechoisir.org.txt new file mode 100644 index 00000000..a8bacdb7 --- /dev/null +++ b/data/GrabberConfig/quechoisir.org.txt @@ -0,0 +1,10 @@ +title: //h1[@id='titre'] +body://h2[@id="surtitre"]|//div[@id="ctn_introarticle"]|//div[@class="ctn_globalcontent"] + +strip_id_or_class: qc-container-main +strip_id_or_class: article_footer + +tidy: no +prune: no + +test_url: http://www.quechoisir.org/alimentation/securite-hygiene/actualite-acrylamide-un-contaminant-trop-present-dans-nos-assiettes diff --git a/data/GrabberConfig/queerty.com.txt b/data/GrabberConfig/queerty.com.txt new file mode 100644 index 00000000..fc7ab37f --- /dev/null +++ b/data/GrabberConfig/queerty.com.txt @@ -0,0 +1,3 @@ +body: //div[@class='copy'] +title: //h1[@class='hed'] +test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/
\ No newline at end of file diff --git a/data/GrabberConfig/questionablecontent.net.txt b/data/GrabberConfig/questionablecontent.net.txt new file mode 100644 index 00000000..21f0724b --- /dev/null +++ b/data/GrabberConfig/questionablecontent.net.txt @@ -0,0 +1,5 @@ +body: //div[@id='container']/div[contains(concat(' ',normalize-space(@class),' '),' row ')]/div[contains(concat(' ',normalize-space(@class),' '),' small-12 ')] +strip_id_or_class: comicnav +strip_id_or_class: small-3 +prune: no +test_url: https://questionablecontent.net/view.php?comic=3844 diff --git a/data/GrabberConfig/quora.com.txt b/data/GrabberConfig/quora.com.txt new file mode 100644 index 00000000..45f9f006 --- /dev/null +++ b/data/GrabberConfig/quora.com.txt @@ -0,0 +1,28 @@ +tidy: no +prune: no +body: //div[contains(concat(' ',normalize-space(@class),' '),' Answer ')] | //div[contains(concat(' ',normalize-space(@class),' '),' header ')] | //div[contains(concat(' ',normalize-space(@class),' '),' AnswerWikiArea ')] | //hr +#body: //div[contains(@class, 'main_col')] + +strip_id_or_class: AnswerFooter +strip_id_or_class: ActionBar +strip_id_or_class: hidden +strip_id_or_class: item_action_bar +strip_id_or_class: answer_voters +strip_id_or_class: question_topics +strip_id_or_class: answer_header_text +strip_id_or_class: editor_link +strip_id_or_class: view_tag +strip_id_or_class: include_details +strip_id_or_class: sig_edit +strip_id_or_class: profile_photo_img +strip_id_or_class: question_text_icons + +# insert hr between answers +find_string: <div class="Answer" +replace_string: <hr /><div class="Answer" + +test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life +test_contains: overcome emotional eating + +test_url: http://www.quora.com/What-is-the-greatest-illusion-in-life +test_contains: What is the greatest illusion in life? 
diff --git a/data/GrabberConfig/qz.com.txt b/data/GrabberConfig/qz.com.txt new file mode 100644 index 00000000..c918dce0 --- /dev/null +++ b/data/GrabberConfig/qz.com.txt @@ -0,0 +1,7 @@ +title: //h1 +body: //div[contains(concat(' ',normalize-space(@class),' '),' item-content ')] +strip: //div[@class='inline-ad'] +strip_id_or_class: item-meta +strip_id_or_class: article-aside + +test_url: https://qz.com/897790/cia-documents-from-attacking-pakistan-to-ufo-sightings-here-is-everything-the-declassified-cia-documents-have-on-india/
\ No newline at end of file diff --git a/data/GrabberConfig/rachelandrew.co.uk.txt b/data/GrabberConfig/rachelandrew.co.uk.txt new file mode 100644 index 00000000..fcece6ef --- /dev/null +++ b/data/GrabberConfig/rachelandrew.co.uk.txt @@ -0,0 +1,4 @@ +author: //a[contains(@class, "p-author")] +date: //time/@datetime + +test_url: https://www.rachelandrew.co.uk/archives/2018/10/04/the-way-we-talk-about-css/ diff --git a/data/GrabberConfig/racjonalista.pl.txt b/data/GrabberConfig/racjonalista.pl.txt new file mode 100644 index 00000000..19c719d4 --- /dev/null +++ b/data/GrabberConfig/racjonalista.pl.txt @@ -0,0 +1,5 @@ +author: /html/body/center/b +date: /html/body/table/tr[2]/td/i +single_page_link: //*[@id='oTxt']/table[3]/tr[2]/td/a[1] + +test_url: http://www.racjonalista.pl/kk.php/s,7214/q,Geneza.szubrawstwa
\ No newline at end of file diff --git a/data/GrabberConfig/radar.oreilly.com.txt b/data/GrabberConfig/radar.oreilly.com.txt new file mode 100644 index 00000000..fa66b815 --- /dev/null +++ b/data/GrabberConfig/radar.oreilly.com.txt @@ -0,0 +1,3 @@ +date://span[@class='date'] +body://div[@class='entry-body'] +test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html
\ No newline at end of file diff --git a/data/GrabberConfig/radionz.co.nz.txt b/data/GrabberConfig/radionz.co.nz.txt new file mode 100644 index 00000000..2496ddab --- /dev/null +++ b/data/GrabberConfig/radionz.co.nz.txt @@ -0,0 +1,3 @@ +body: //div[@class='body'] +title: //div[@class='newsstory']/h2 +test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d
\ No newline at end of file diff --git a/data/GrabberConfig/randsinrepose.com.txt b/data/GrabberConfig/randsinrepose.com.txt new file mode 100644 index 00000000..6970a744 --- /dev/null +++ b/data/GrabberConfig/randsinrepose.com.txt @@ -0,0 +1,11 @@ +title: //div[@id='center-col']/h4 +author: substring-before(//title,'In') +date: substring-after(//div[@class='commenttext']/span,'#') +body: //div[@id='center-col'] +strip: //div[@id='center-col']/h4 +strip: //div[@class='graytext'] + +# Anthony Perez-Sanz 2012.3.14 +# Removed long gif from the end +strip: //img[@src='http://www.randsinrepose.com/spreader.gif'] +test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html
\ No newline at end of file diff --git a/data/GrabberConfig/rasgolatente.es.txt b/data/GrabberConfig/rasgolatente.es.txt new file mode 100644 index 00000000..fcdbbd97 --- /dev/null +++ b/data/GrabberConfig/rasgolatente.es.txt @@ -0,0 +1,6 @@ +title: //div[@class='entry-title'] +author: //div[@class='entry-author'] +date: //div[@class='entry-time'] +body: //div[@class='entry-content'] + +test_url: http://rasgolatente.es/estupidez-psicologia-estupidos/ diff --git a/data/GrabberConfig/readability.com.txt b/data/GrabberConfig/readability.com.txt new file mode 100644 index 00000000..2d5aba76 --- /dev/null +++ b/data/GrabberConfig/readability.com.txt @@ -0,0 +1,3 @@ +single_page_link: //link[@rel='canonical']/@href + +test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler
\ No newline at end of file diff --git a/data/GrabberConfig/real.gr.txt b/data/GrabberConfig/real.gr.txt new file mode 100644 index 00000000..ce0a3c43 --- /dev/null +++ b/data/GrabberConfig/real.gr.txt @@ -0,0 +1,5 @@ +body: //div[contains(@class, 'article-photo-wrapper')] +prune: no + +test_url: http://www.real.gr/DefaultArthro.aspx?page=arthro&id=360962&catID=1 +test_contains: Επισήμως το αποψινό υπουργικό diff --git a/data/GrabberConfig/rebooti.com.txt b/data/GrabberConfig/rebooti.com.txt new file mode 100644 index 00000000..fb68b26b --- /dev/null +++ b/data/GrabberConfig/rebooti.com.txt @@ -0,0 +1,10 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Frebooti.com%2F2015%2F11%2Fsamsung-gear-vr-sold-out-online-at-amazon-and-best-buy%2F + +body: //div[@id='content-anchor-inner'] + +strip_id_or_class: adsbygoogle +strip: //ins + +test_url: http://rebooti.com/2015/11/samsung-gear-vr-sold-out-online-at-amazon-and-best-buy/ diff --git a/data/GrabberConfig/recode.net.txt b/data/GrabberConfig/recode.net.txt new file mode 100644 index 00000000..c5581206 --- /dev/null +++ b/data/GrabberConfig/recode.net.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' c-entry-content ')] + +test_url: https://www.recode.net/2017/6/7/15749994/alphabet-project-wing-autonomous-drone-tracking-faa-nasa +test_url: https://www.recode.net/2017/6/10/15774936/apple-podcast-analytics-wwdc diff --git a/data/GrabberConfig/redbull.com.txt b/data/GrabberConfig/redbull.com.txt new file mode 100644 index 00000000..1cb8c35e --- /dev/null +++ b/data/GrabberConfig/redbull.com.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.redbull.com%2Fpl-pl%2Flutto-lento-ciemne-sprawy-wywiad 
+ +body: //div[contains(concat(' ',normalize-space(@class),' '),' story-page__body ')] +test_url: https://www.redbull.com/pl-pl/lutto-lento-ciemne-sprawy-wywiad +test_contains: I zupełnie nikt nie zareagował diff --git a/data/GrabberConfig/reddit.com.txt b/data/GrabberConfig/reddit.com.txt new file mode 100644 index 00000000..5d358e0c --- /dev/null +++ b/data/GrabberConfig/reddit.com.txt @@ -0,0 +1,22 @@ +# This setup grabs the text from a Reddit self post. It ignores all comments etc. + +title: //p[@class="title"]/a/text() + +author: //div[@class="top-matter"]/p[contains(concat(' ',normalize-space(@class),' '),'tagline ')]/a/text() + +# this doesn't work for some reason...? +date: //p[@class="tagline"]//@datetime + +body: //div[contains(concat(' ',normalize-space(@class),' '),' expando ')] + +strip_id_or_class: tagline +strip_id_or_class: unvotable-message +strip_id_or_class: buttons + +# follow the posted link (unless it's a self post - relative URL, no http://) +single_page_link: //p[@class="title"]/a[contains(@href, 'http://')] + +test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/ +test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/ +test_url: http://www.reddit.com/r/WritingPrompts/comments/2786lw/wp_in_a_world_where_puns_are_illegal_one_man/chybk8e +test_url: https://www.reddit.com/r/LinuxActionShow/comments/1fccny/arch_linux_survival_guide/ diff --git a/data/GrabberConfig/redeszone.net.txt b/data/GrabberConfig/redeszone.net.txt new file mode 100644 index 00000000..75719955 --- /dev/null +++ b/data/GrabberConfig/redeszone.net.txt @@ -0,0 +1,15 @@ +title: //h2[@class='entry-title'] +body: //article +author: //span[contains(concat(' ',normalize-space(@class),' '),' author ')] +date: //time + +strip: //iframe +strip: //img[@id='amazon-track'] +strip: //div[@class='featured-image'] +strip: //img[@id='gaz-track'] +strip: //aside +strip_id_or_class: 
NEWS_BODY +strip_id_or_class: NEWS_RELATED +strip_id_or_class: social-full + +test_url: https://www.redeszone.net/2017/02/07/masmovil-implementa-redes-cgn-carrier-grade-nat-conoce-todos-los-problemas-tendras/ diff --git a/data/GrabberConfig/redmondpie.com.txt b/data/GrabberConfig/redmondpie.com.txt new file mode 100644 index 00000000..66cc1707 --- /dev/null +++ b/data/GrabberConfig/redmondpie.com.txt @@ -0,0 +1,13 @@ +title: //div[@class='posthead']//h2 +body: //div[contains(@class, 'postcontent') or @class='posthead'] +author: //div[@class='posthead']//a[@rel='author'] + +strip: //div[@class='posthead']//h2 +replace_string(>Advertisements</div>): ></div> +replace_string(<p>You can follow us on): <p style="display:none;"> +strip_id_or_class: likeThisPost + +prune: no +tidy: no + +test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/
\ No newline at end of file diff --git a/data/GrabberConfig/reflets.info.txt b/data/GrabberConfig/reflets.info.txt new file mode 100644 index 00000000..7c0149cb --- /dev/null +++ b/data/GrabberConfig/reflets.info.txt @@ -0,0 +1,17 @@ +title: //h1 +author: //a[@class='author'] + +# wallabag-specific login directives (not supported in FTR) +requires_login: yes + +login_uri: https://reflets.info/users/sign_in +login_username_field: user[email] +login_password_field: user[password] +login_extra_fields: authenticity_token=@=xpath('//input[@name="authenticity_token"]', request_html('https://reflets.info/users/sign_in')) +login_extra_fields: utf8=✓ +login_extra_fields: user[remember_me]=1 +login_extra_fields: commit=Connexion + +not_logged_in_xpath: //div[contains(@class, "warning")] + +test_url: https://reflets.info/articles/monetiser-ses-donnees-personnelles-l-arbre-qui-cache-la-foret diff --git a/data/GrabberConfig/renenekuda.cz.txt b/data/GrabberConfig/renenekuda.cz.txt new file mode 100644 index 00000000..a5361fd0 --- /dev/null +++ b/data/GrabberConfig/renenekuda.cz.txt @@ -0,0 +1,3 @@ +title: //*[@class='entry-title'] +body: //div[@class='entry-content'] +test_url: http://www.renenekuda.cz/recept-na-produktivitu/
\ No newline at end of file diff --git a/data/GrabberConfig/reportermagazin.cz.txt b/data/GrabberConfig/reportermagazin.cz.txt new file mode 100644 index 00000000..87ae4f00 --- /dev/null +++ b/data/GrabberConfig/reportermagazin.cz.txt @@ -0,0 +1,5 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-detail__contentArticle ')] +title: //h1[contains(concat(' ',normalize-space(@class),' '),' article-detail__title ')] +author: //div[contains(concat(' ',normalize-space(@class),' '),' article-detail__contentArticle ')] +date: //div[contains(concat(' ',normalize-space(@class),' '),' article-header-big__authorDate ')] +test_url: https://reportermagazin.cz/a/iJ3Hz/prosime-boha-aby-meli-cesti-lekari-dobry-zivot diff --git a/data/GrabberConfig/resume.se.txt b/data/GrabberConfig/resume.se.txt new file mode 100644 index 00000000..4f41ce1c --- /dev/null +++ b/data/GrabberConfig/resume.se.txt @@ -0,0 +1,6 @@ +body: //div[contains(@class, 'article-body')] + +prune: no + +test_url: https://www.resume.se/nyheter/artiklar/2017/09/19/blossa-lanserar-arets-glogg-via-livestream--hamtar-inspiration-fran-indien +test_url: http://www.resume.se/rss-nyheter diff --git a/data/GrabberConfig/retractionwatch.com.txt b/data/GrabberConfig/retractionwatch.com.txt new file mode 100644 index 00000000..ca2e6c79 --- /dev/null +++ b/data/GrabberConfig/retractionwatch.com.txt @@ -0,0 +1,3 @@ +title: //h1 +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: https://retractionwatch.com/2018/07/30/how-institutions-gaslight-whistleblowers-and-what-can-be-done/ diff --git a/data/GrabberConfig/reuters.com.txt b/data/GrabberConfig/reuters.com.txt new file mode 100644 index 00000000..1b2f3d03 --- /dev/null +++ b/data/GrabberConfig/reuters.com.txt @@ -0,0 +1,13 @@ +title: //h1[@class='headline3'] +author: substring-after(//p[@class="byline"], 'By ') +date: //meta[@name="REVISION_DATE"]/@content +body: //div[@id='articleImage' or @id='frame_fd1fade'] | 
//span[@id='article-text'] | //div[@class='pageNavigation'] +body: //article[@itemtype="http://schema.org/NewsArticle"] +strip: //li[@class='next'] +strip: //span[@class='articleLocation'] +prune: no +tidy: no + +test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408 +test_url: http://www.reuters.com/investigates/special-report/usa-taser-experts/ +test_contains: Among the settlements examined by Reuters diff --git a/data/GrabberConfig/revue-farouest.fr.txt b/data/GrabberConfig/revue-farouest.fr.txt new file mode 100644 index 00000000..cf0c670f --- /dev/null +++ b/data/GrabberConfig/revue-farouest.fr.txt @@ -0,0 +1,41 @@ + +replace_string(lang="en"): lang="fr" + +#----------------------------------------------------------- +# for text articles +#----------------------------------------------------------- +test_url: https://www.revue-farouest.fr/appelez-moi-ernest/ + +body: //div[@class='ep_intro'] | //div[@class='ep_wisy'] + +author: //div[@class='ep_auteur_box']//a + +strip_id_or_class: rcp_paid_only +strip_id_or_class: BouttonPartageEpisode + +#----------------------------------------------------------- +# for video articles: +#----------------------------------------------------------- +test_url: https://www.revue-farouest.fr/videos/manon-bril-youtube-antique/ + +body: //iframe | //div[@id='info_article'] + +author: //div[@class='AuteurDateVideo']//a + +prune: no +strip: //script + +strip_id_or_class: party_payant +strip_id_or_class: pop-register + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' rcp_paid_only ')|contains(concat(' ',normalize-space(@class),' '),' party_payant ')] +login_uri: https://www.revue-farouest.fr/wp-admin/admin-ajax.php +login_extra_fields: action=ajaxlogin 
+login_username_field: username +login_password_field: password +login_extra_fields: security=@=xpath('//input[@name="security"]', request_html('https://www.revue-farouest.fr/')) + diff --git a/data/GrabberConfig/rework.withgoogle.com.txt b/data/GrabberConfig/rework.withgoogle.com.txt new file mode 100644 index 00000000..39abb83a --- /dev/null +++ b/data/GrabberConfig/rework.withgoogle.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Frework.withgoogle.com%2Fblog%2Fsuperpowers-at-work-okrs%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' blog-post__container ')] +test_url: https://rework.withgoogle.com/blog/superpowers-at-work-okrs/ diff --git a/data/GrabberConfig/rezeptwelt.de.txt b/data/GrabberConfig/rezeptwelt.de.txt new file mode 100644 index 00000000..2093573b --- /dev/null +++ b/data/GrabberConfig/rezeptwelt.de.txt @@ -0,0 +1,5 @@ +body: //div[@class='step-content'] | //div[@class='global-active ingredients-box'] +title: //div[@class='step-1-container'] + +tidy: no +test_url: http://www.rezeptwelt.de/backen-herzhaft-rezepte/w%C3%BCrstchen-schlangen/530372
\ No newline at end of file diff --git a/data/GrabberConfig/rhenus.com.txt b/data/GrabberConfig/rhenus.com.txt new file mode 100644 index 00000000..ed2c2933 --- /dev/null +++ b/data/GrabberConfig/rhenus.com.txt @@ -0,0 +1,5 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' news-single-item ')] +title: //div[contains(concat(' ',normalize-space(@class),' '),' news-single-item ')]//h1 +strip_id_or_class: news-single-backlink + +test_url: http://www.rhenus.com/en/infocenter/press/single-news/article/rhenus-sells-freight-forwarding-centre-in-fellbach-to-raben/ diff --git a/data/GrabberConfig/robertsspaceindustries.com.txt b/data/GrabberConfig/robertsspaceindustries.com.txt new file mode 100644 index 00000000..b0b90fb7 --- /dev/null +++ b/data/GrabberConfig/robertsspaceindustries.com.txt @@ -0,0 +1,4 @@ +strip_id_or_class: 'sharedaddy' +strip_id_or_class: 'respond' +strip_id_or_class: 'meta' +test_url: http://www.robertsspaceindustries.com/news-update-ai-pilots/
\ No newline at end of file diff --git a/data/GrabberConfig/robots.thoughtbot.com.txt b/data/GrabberConfig/robots.thoughtbot.com.txt new file mode 100644 index 00000000..da5b7bd8 --- /dev/null +++ b/data/GrabberConfig/robots.thoughtbot.com.txt @@ -0,0 +1,5 @@ +body: //section[@class='post text'] +title: //h1[@class='title'] +date: //p[@class='post-date'] +strip: //section[@class='meta-info'] +test_url: http://robots.thoughtbot.com/post/32455387133/four-phase-test
\ No newline at end of file diff --git a/data/GrabberConfig/rockpapershotgun.com.txt b/data/GrabberConfig/rockpapershotgun.com.txt new file mode 100644 index 00000000..83342cb7 --- /dev/null +++ b/data/GrabberConfig/rockpapershotgun.com.txt @@ -0,0 +1,10 @@ +title: //h2 + +strip: //div[ contains(@class, 'respond') ] | //h2 | //h1 +strip_id_or_class: social +strip_id_or_class: dd_post_share + +date: substring-after(//p[@class='info'], ' on ') + +author: //p[@class='info']//a +test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/ diff --git a/data/GrabberConfig/rodrigo.sharpcube.com.txt b/data/GrabberConfig/rodrigo.sharpcube.com.txt new file mode 100644 index 00000000..eef8b11c --- /dev/null +++ b/data/GrabberConfig/rodrigo.sharpcube.com.txt @@ -0,0 +1,7 @@ +author: //article/header/span[@class='author'] +title://article/header/h1 +body: //article +strip: //article/header +strip: //article/p[@class='metadata'] +footnotes: yes +test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/
\ No newline at end of file diff --git a/data/GrabberConfig/rogerebert.com.txt b/data/GrabberConfig/rogerebert.com.txt new file mode 100644 index 00000000..da215109 --- /dev/null +++ b/data/GrabberConfig/rogerebert.com.txt @@ -0,0 +1,8 @@ +title: substring-before(//title,':') +author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY') + +body: //div[@class='text'] + +strip: //a[contains(@href,'printart')] +strip_id_or_class: enlarge_photo +test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY
\ No newline at end of file diff --git a/data/GrabberConfig/rollingstone.com.txt b/data/GrabberConfig/rollingstone.com.txt new file mode 100644 index 00000000..9ad6c24c --- /dev/null +++ b/data/GrabberConfig/rollingstone.com.txt @@ -0,0 +1,15 @@ +body: //div[@id='collection-items-container'] +body: //div[contains(@class, 'article-content')] + +prune: no + +next_page_link: //link[@rel="next"]/@href +if_page_contains: //a[contains(@class, 'pagination-collection')] + +strip_id_or_class: module-more-news +strip_id_or_class: module-related + +test_url: http://www.rollingstone.com/politics/news/the-plastic-bag-wars-20110725 +# mult-page article +test_url: http://www.rollingstone.com/culture/pictures/beyond-gypsy-blancharde-when-mothers-harm-their-kids-w437364 +test_contains: Former chemist and mother of three diff --git a/data/GrabberConfig/rom-game.fr.txt b/data/GrabberConfig/rom-game.fr.txt new file mode 100644 index 00000000..3d97823d --- /dev/null +++ b/data/GrabberConfig/rom-game.fr.txt @@ -0,0 +1,4 @@ +author: //div[@id="main"]//a[contains(@href, "/auteurs/")] +date: //meta[@itemprop="datePublished"]/@content + +test_url: https://www.rom-game.fr/news/3371-Goldeneye+007+devient+Goldeneye+25.html diff --git a/data/GrabberConfig/root.cz.txt b/data/GrabberConfig/root.cz.txt new file mode 100644 index 00000000..7c1c7642 --- /dev/null +++ b/data/GrabberConfig/root.cz.txt @@ -0,0 +1,6 @@ +body: //div[@class='urs'] | //div[@itemprop='articleBody'] + +strip_id_or_class: perex__impressum + +test_url: http://www.root.cz/zpravicky/google-testuje-moznost-vyzkouset-mobilni-aplikace-pred-instalaci/ +test_url: http://www.root.cz/clanky/let-s-encrypt-otevrel-branu-ve-verejne-bete-muze-mit-certifikat-kazdy/
\ No newline at end of file diff --git a/data/GrabberConfig/rottentomatoes.com.txt b/data/GrabberConfig/rottentomatoes.com.txt new file mode 100644 index 00000000..ef327691 --- /dev/null +++ b/data/GrabberConfig/rottentomatoes.com.txt @@ -0,0 +1,11 @@ +body: //div[@class='movie_content_area'] +strip_id_or_class: tomatometer_bar_help +strip_id_or_class: critic-links +strip_id_or_class: top-critics-numbers +strip_id_or_class: fan_side +strip_id_or_class: fblike +strip_id_or_class: rating_widget +strip_id_or_class: friend_reviews +prune: no + +test_url: http://www.rottentomatoes.com/m/thor/
\ No newline at end of file diff --git a/data/GrabberConfig/roughtype.com.txt b/data/GrabberConfig/roughtype.com.txt new file mode 100644 index 00000000..a012a67d --- /dev/null +++ b/data/GrabberConfig/roughtype.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='content'] +strip: //p[@class='postmeta']/following::* +strip: //p[@class='postmeta'] +strip: //p[@align='left'] +test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php
\ No newline at end of file diff --git a/data/GrabberConfig/roy.gbiv.com.txt b/data/GrabberConfig/roy.gbiv.com.txt new file mode 100644 index 00000000..6ff03de8 --- /dev/null +++ b/data/GrabberConfig/roy.gbiv.com.txt @@ -0,0 +1,2 @@ +strip_comments: no +test_url: http://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven
\ No newline at end of file diff --git a/data/GrabberConfig/rpgsite.net.txt b/data/GrabberConfig/rpgsite.net.txt new file mode 100644 index 00000000..9ddbf0f2 --- /dev/null +++ b/data/GrabberConfig/rpgsite.net.txt @@ -0,0 +1,4 @@ +body: //div[@id='news-text'] +prune: no +test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy +test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork
\ No newline at end of file diff --git a/data/GrabberConfig/rubysfera.pl.txt b/data/GrabberConfig/rubysfera.pl.txt new file mode 100644 index 00000000..d9d9a431 --- /dev/null +++ b/data/GrabberConfig/rubysfera.pl.txt @@ -0,0 +1,9 @@ +author: //div[contains(@class, 'author_text')]/h4/text() +date: //li[@class='date'] + +# stripping excessive tags +strip: //div[contains(@class, 'entry_meta')] +strip: //div[contains(@class, 'single_meta')] +strip: //br[contains(@class, 'clear')] +strip: //h3[contains(., 'Komentarz')] +test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/
\ No newline at end of file diff --git a/data/GrabberConfig/ruhlman.com.txt b/data/GrabberConfig/ruhlman.com.txt new file mode 100644 index 00000000..e54b0f0e --- /dev/null +++ b/data/GrabberConfig/ruhlman.com.txt @@ -0,0 +1,6 @@ +title: //h1[@class='entry-title'] +author: ///span[@class='author vcard'] +date: //abbr[@class='published'] +body: //div[@class='entry-content'] + +test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/
\ No newline at end of file diff --git a/data/GrabberConfig/saadaalnews.net.txt b/data/GrabberConfig/saadaalnews.net.txt new file mode 100644 index 00000000..b9ce04e5 --- /dev/null +++ b/data/GrabberConfig/saadaalnews.net.txt @@ -0,0 +1,11 @@ +body: //div[contains(@class, 'section-content-left')] + +strip_id_or_class: related +strip_id_or_class: nocontent +strip_id_or_class: comment +strip_id_or_class: widget +strip_id_or_class: respond +strip: //h3[.='Comments'] +strip: //p[.='comments'] + +test_url: http://saadaalnews.net/?p=42624 diff --git a/data/GrabberConfig/salon.com.txt b/data/GrabberConfig/salon.com.txt new file mode 100644 index 00000000..d669fdb3 --- /dev/null +++ b/data/GrabberConfig/salon.com.txt @@ -0,0 +1,11 @@ +title: //meta[@property='og:title']/@content +author: (//span[@class="byline"]/a)[1] +date: //span[contains(@class, "toLocalTime")] +body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")] + +prune: no + +# deal with singleton links +single_page_link: (//h1/a[contains(@href, '/singleton')])[1] + +test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/ diff --git a/data/GrabberConfig/salzburg.com.txt b/data/GrabberConfig/salzburg.com.txt new file mode 100644 index 00000000..464f99f1 --- /dev/null +++ b/data/GrabberConfig/salzburg.com.txt @@ -0,0 +1,6 @@ +body: //p[@class='teaser1 darkgrey myriad'] +move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear'] +strip: //div[@class='hidden'] +strip: //div[@id='article_related_source'] + +test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/
\ No newline at end of file diff --git a/data/GrabberConfig/sanpedrosun.com.txt b/data/GrabberConfig/sanpedrosun.com.txt new file mode 100644 index 00000000..3f19cced --- /dev/null +++ b/data/GrabberConfig/sanpedrosun.com.txt @@ -0,0 +1,10 @@ +title: //div[contains(@class, 'post')]//h1 +date: //div[contains(@class, 'post')]//h6 +body: //div[contains(@class, 'entry')] +strip_id_or_class: post_stats +strip_id_or_class: related-posts +strip_id_or_class: after_story +prune: no + +test_url: http://www.sanpedrosun.com/community-and-society/2013/06/05/little-angelspre-school-talent-show/ +test_url: http://www.sanpedrosun.com/feed/
\ No newline at end of file diff --git a/data/GrabberConfig/sargasso.nl.txt b/data/GrabberConfig/sargasso.nl.txt new file mode 100644 index 00000000..fcbadd85 --- /dev/null +++ b/data/GrabberConfig/sargasso.nl.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fsargasso.nl%2Fquote-du-jour-gibraltar%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: http://sargasso.nl/quote-du-jour-gibraltar/
\ No newline at end of file diff --git a/data/GrabberConfig/saveyourself.ca.txt b/data/GrabberConfig/saveyourself.ca.txt new file mode 100644 index 00000000..5a5605d9 --- /dev/null +++ b/data/GrabberConfig/saveyourself.ca.txt @@ -0,0 +1,25 @@ +title://h1 + +# my section divs seem to interfere with the Instapaper parser, so I ditch 'em +dissolve://div[contains(@class, 'section')] + +#these don't seem to be necessary, but just in case +strip_id_or_class:'masthead' +strip_id_or_class:'footer' + +#again, Instapaper seems to understand where my content is, but just in case +body://div[@id='content'] + +# in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing +strip_id_or_class:'screen-only' +strip_id_or_class:'no-print' + +#other misc removals and simplifications +strip_id_or_class:'popup' +strip_id_or_class:'ZoomSpin' + +#I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes +wrap_in(blockquote)://div[contains(@class, 'sidebar')] +wrap_in(blockquote)://div[contains(@class, 'meta')] +wrap_in(blockquote)://p[contains(@class, 'meta')] +test_url: http://saveyourself.ca/tutorials/low-back-pain.php
\ No newline at end of file diff --git a/data/GrabberConfig/sayidaty.net.txt b/data/GrabberConfig/sayidaty.net.txt new file mode 100644 index 00000000..2d9f1884 --- /dev/null +++ b/data/GrabberConfig/sayidaty.net.txt @@ -0,0 +1,4 @@ +date: //meta[@property='article:published_time']/@content +body: (//div[contains(@class, 'article-slider')]//img)[1] | //div[contains(@class, 'bottom-article-con')] + +test_url: http://www.sayidaty.net/taxonomy/term/10/all/feed
\ No newline at end of file diff --git a/data/GrabberConfig/sbnation.com.txt b/data/GrabberConfig/sbnation.com.txt new file mode 100644 index 00000000..41b36755 --- /dev/null +++ b/data/GrabberConfig/sbnation.com.txt @@ -0,0 +1,28 @@ +title: //h1[@id='stream_title'] + +# Author and date don't work +author: //div[@class='byline'] +date: //div[@class='date-stamp'] + +body: //div[@class='node-article'] + +strip_id_or_class: fb-like-box +strip_id_or_class: stream-fb-like +strip_id_or_class: social-meta +strip_id_or_class: social-spoken +strip_id_or_class: twitter-share-button +strip_id_or_class: twitter-follow-button +strip_id_or_class: spinner_node_list +strip_id_or_class: node-sort-link +strip_id_or_class: stream_title +strip_id_or_class: stream_summary +strip_id_or_class: update-count-container +strip_id_or_class: major-updates +strip_id_or_class: newsletter-slide +strip_id_or_class: author-mini-profile +strip_id_or_class: byline +strip_id_or_class: header +strip_id_or_class: footer + +# Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns +test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic
\ No newline at end of file diff --git a/data/GrabberConfig/schneier.com.txt b/data/GrabberConfig/schneier.com.txt new file mode 100644 index 00000000..0074a86a --- /dev/null +++ b/data/GrabberConfig/schneier.com.txt @@ -0,0 +1,25 @@ +author: //p[@class='mastname'] + +body: //div[@class='indivbody'] +date: //div[@class='indivbody']/h2[1] + +# Remove blog title. Specify first occurrence in case h1 is used in article +strip: //div[@class='indivbody']/h1[1] + +# Remove blog description (the first p element) +strip: //div[@class='indivbody']/p[1] + +# Remove navigation (second p element) +strip: //div[@class='indivbody']/p[2] + +# Remove duplicate of article title. Specify first occurrence in case h3 is used in article +strip: //div[@class='indivbody']/h3[1] + +# Remove publishing date, it's extracted by rule above +strip: //div[@class='indivbody']/h2[1] + +# Remove duplicate of date at end, and newsletter signup +strip: //p[@class='posted'] + +# Leave date at top +test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html
\ No newline at end of file diff --git a/data/GrabberConfig/scienceblogs.de.txt b/data/GrabberConfig/scienceblogs.de.txt new file mode 100644 index 00000000..b0dec3d2 --- /dev/null +++ b/data/GrabberConfig/scienceblogs.de.txt @@ -0,0 +1,12 @@ +single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a + +author: //div[@class='details clear']//a[@class='hi'] +body: //div[@class='title'] +strip: //p[@class='entrypagination'] +strip: //p[@class='details_top'] +date: //p[@class='details_top'] +title: //div[@class='title']/h1 +strip: //p[@class='details'] +strip: //p[@class='details_bottom'] + +test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php
\ No newline at end of file diff --git a/data/GrabberConfig/scienceticker.info.txt b/data/GrabberConfig/scienceticker.info.txt new file mode 100644 index 00000000..2a06f734 --- /dev/null +++ b/data/GrabberConfig/scienceticker.info.txt @@ -0,0 +1,11 @@ +body: //div[@class='post'] +title: //h1[@id='singlePageTitle'] +date: substring-before(//small,'• Rubrik') + +strip: //div[@class='post-ratings'] +strip: //div[@class='post-ratings-loading'] +strip: //a[@title='Empfehlen Sie den Text weiter!'] +strip: //a[@title='Drucken'] +strip: //div[@class='share'] + +test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/
\ No newline at end of file diff --git a/data/GrabberConfig/scilogs.de.txt b/data/GrabberConfig/scilogs.de.txt new file mode 100644 index 00000000..b24d7844 --- /dev/null +++ b/data/GrabberConfig/scilogs.de.txt @@ -0,0 +1,15 @@ +title: //h1 +author: //div[@class='date']/a +date: substring-after(//div[@class='date'], ',') +body: //div[@class='entrybody'] + +strip_id_or_class: socialshareprivacy +strip: //div[@class='entrybody']/br[1] + +# Strip related articles +# 'p'-Tag strips 'Ähnliche Artikel: ' (<br> tags become <p>) +strip: //div[@class='entrybody']/p[last()] +strip: //div[@class='entrybody']/ul[last()] + +convert_double_br_tags: yes +test_url: http://www.scilogs.de/wblogs/blog/formbar/fusion/2012-10-08/rundgang-durch-deutschlands-gr-tes-fusionsexperiment
\ No newline at end of file diff --git a/data/GrabberConfig/scinfolex.com.txt b/data/GrabberConfig/scinfolex.com.txt new file mode 100644 index 00000000..14fe37ce --- /dev/null +++ b/data/GrabberConfig/scinfolex.com.txt @@ -0,0 +1,3 @@ +author: //div[@class="entry-meta"]//a[contains(@class, "url")] + +test_url: https://scinfolex.com/2018/09/15/la-directive-copyright-nest-pas-une-defaite-pour-linternet-libre-et-ouvert/ diff --git a/data/GrabberConfig/scnsrc.me.txt b/data/GrabberConfig/scnsrc.me.txt new file mode 100644 index 00000000..788fbdbd --- /dev/null +++ b/data/GrabberConfig/scnsrc.me.txt @@ -0,0 +1,4 @@ +body: //div[@class='storycontent'] +tidy: no +test_url: http://www.scnsrc.me/assassins-creed-2016-720p-bluray-x264-sparks/ +replace_string(height="240"): width="240" height="240" diff --git a/data/GrabberConfig/scotusblog.com.txt b/data/GrabberConfig/scotusblog.com.txt new file mode 100644 index 00000000..8881bb45 --- /dev/null +++ b/data/GrabberConfig/scotusblog.com.txt @@ -0,0 +1,8 @@ +title: //title +author: //p[@id='author-name-role']/a +date: substring-after(//p[@class='time'],'Posted') +body: //div[@id='main'] +strip: //div[@id='author-info'] +strip: //div[@id='author-links'] +strip: //h1 +test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/
\ No newline at end of file diff --git a/data/GrabberConfig/scripting.com.txt b/data/GrabberConfig/scripting.com.txt new file mode 100644 index 00000000..5fb0ee79 --- /dev/null +++ b/data/GrabberConfig/scripting.com.txt @@ -0,0 +1,8 @@ +strip: //a[starts-with(@href, '#')] +strip: //*[@class='storyByline'] +body: //*[@class='storyPageText']/.. +author: string('Dave Winer') +date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at') +title: //h1 +footnotes: no +test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html
\ No newline at end of file diff --git a/data/GrabberConfig/searchenginejournal.com.txt b/data/GrabberConfig/searchenginejournal.com.txt new file mode 100644 index 00000000..dc98af3c --- /dev/null +++ b/data/GrabberConfig/searchenginejournal.com.txt @@ -0,0 +1,5 @@ +strip: //ul[contains(@id, "social")] +strip: //div[contains(@class, "ts-fab-wrapper")] +strip: //div[contains(@id, 'gpt-ad')] + +test_url: http://www.searchenginejournal.com/web-design-vs-seo-it-doesnt-make-much-sense/62294/ diff --git a/data/GrabberConfig/searchengineland.com.txt b/data/GrabberConfig/searchengineland.com.txt new file mode 100644 index 00000000..9ccc5898 --- /dev/null +++ b/data/GrabberConfig/searchengineland.com.txt @@ -0,0 +1,20 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' article ') and (contains(concat(' ',normalize-space(@class),' '),' clear '))] +title: //div[@class="storyBox"]/h1 +author: //a[@rel="author"] +date: substring-before(//span[@class="dateline"], 'by') + +#Removes related content but cleans up article text +strip: //h1 +strip: //p[@class="homeStory tdmSideInfo"] +strip: //div[@id="bylineShare"] +strip: //script +strip: //hr + +strip_id_or_class: homeStory +strip_id_or_class: authorpic +strip_id_or_class: insideComments +strip_id_or_class: authorbio +strip_id_or_class: gpt-ad-sel-cube +strip_id_or_class: smxTextAd + +test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348
\ No newline at end of file diff --git a/data/GrabberConfig/seattletransitblog.com.txt b/data/GrabberConfig/seattletransitblog.com.txt new file mode 100644 index 00000000..5129c069 --- /dev/null +++ b/data/GrabberConfig/seattletransitblog.com.txt @@ -0,0 +1,5 @@ +title: //h3[@class="storytitle"] +date: //div[@class='meta'] +body: //div[@class='storycontent'] + +test_url: http://seattletransitblog.com/2012/06/19/times-st-louis-interested-in-buying-waterfront-streetcars/
\ No newline at end of file diff --git a/data/GrabberConfig/secouchermoinsbete.fr.txt b/data/GrabberConfig/secouchermoinsbete.fr.txt new file mode 100644 index 00000000..c66a1cdf --- /dev/null +++ b/data/GrabberConfig/secouchermoinsbete.fr.txt @@ -0,0 +1,9 @@ +prune: no +body: //article[@class='anecdote'] +strip: //article[@class='anecdote']/aside/div[@class='column-wrapper'] +strip: //article[@class='anecdote']/aside/div[@id='related-sources-wrapper']/div[@id='related'] + + +test_url: http://secouchermoinsbete.fr/62836-audi-a-ete-cree-par-un-ancien-chef-de-chez-mercedes +test_url: http://secouchermoinsbete.fr/62795-sous-l-eau-a-plus-de-10-metres-votre-sang-est-vert +test_url: http://secouchermoinsbete.fr/62663-l-invention-qui-pourrait-nettoyer-les-oceans-en-quelques-annees diff --git a/data/GrabberConfig/securityaffairs.co.txt b/data/GrabberConfig/securityaffairs.co.txt new file mode 100644 index 00000000..ee3d03aa --- /dev/null +++ b/data/GrabberConfig/securityaffairs.co.txt @@ -0,0 +1,4 @@ +# Generated by FiveFilters.org's web-based selection tool + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post_wrapper_inner ')]//div[contains(concat(' ',normalize-space(@class),' '),' post_inner_wrapper ')] +test_url: http://securityaffairs.co/wordpress/64590/malware/mac-proton-malware-supplychain-attack.html diff --git a/data/GrabberConfig/secushare.org.txt b/data/GrabberConfig/secushare.org.txt new file mode 100644 index 00000000..6d1429ff --- /dev/null +++ b/data/GrabberConfig/secushare.org.txt @@ -0,0 +1,12 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http://secushare.org/PGP + +strip: //div[@id='nav'] +strip: //div[@id='header'] + +date: //meta[@name='generated']/@content +author: //meta[@name='author']/@content + +body: //div[@id='content'] +test_url: http://secushare.org/PGP diff --git 
a/data/GrabberConfig/select.yeeyan.org.txt b/data/GrabberConfig/select.yeeyan.org.txt new file mode 100644 index 00000000..6e98b149 --- /dev/null +++ b/data/GrabberConfig/select.yeeyan.org.txt @@ -0,0 +1,18 @@ +# This filter is tested on: +# http://select.yeeyan.org/view/18312/332365 +# http://select.yeeyan.org/view/365295/333788 +# http://select.yeeyan.org/view/174464/332336 + +tidy:no +prune:no +title://h1 +author: //div[@class='sa_author']/span/a +date: substring-after(//div[@class='sa_author']/span/following-sibling::span, ':') +body: //div[@class='sa_left closetag'] +wrap_in(b)://div[@class='sa_abstract'] + +strip://ul[@class='sa_next clearfix'] +strip: //div[@class='sa_author'] +strip: //div[@class='sa_title_box'] + +test_url: http://select.yeeyan.org/view/258033/333481
\ No newline at end of file diff --git a/data/GrabberConfig/semiaccurate.com.txt b/data/GrabberConfig/semiaccurate.com.txt new file mode 100644 index 00000000..d3a24db3 --- /dev/null +++ b/data/GrabberConfig/semiaccurate.com.txt @@ -0,0 +1,13 @@ +tidy: no +prune: no + +title: //section[@id='content']//h1 +date: //span[@class='date'] +author: //span[@class='post-author']/a + +body: //div[@class='entry-content'] + +strip: //div[@id='ts-fab-below'] +strip: //div[@class='sharedaddy sd-sharing-enabled'] + +test_url: http://semiaccurate.com/2015/04/21/ubuntu-strips-phone-os-core/ diff --git a/data/GrabberConfig/seriouseats.com.txt b/data/GrabberConfig/seriouseats.com.txt new file mode 100644 index 00000000..5e633470 --- /dev/null +++ b/data/GrabberConfig/seriouseats.com.txt @@ -0,0 +1,15 @@ +body: //div[@id='content'] + +# clean up recipe pages +strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] + +#recipe pages +strip_id_or_class: "recipe-feedback" +strip_id_or_class: "comments" +strip_id_or_class: "procedure-number" +strip_id_or_class: "more-with-author" + +#slice +strip_id_or_class: "inner" + +test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html
\ No newline at end of file diff --git a/data/GrabberConfig/servethehome.com.txt b/data/GrabberConfig/servethehome.com.txt new file mode 100644 index 00000000..6f69dd90 --- /dev/null +++ b/data/GrabberConfig/servethehome.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.servethehome.com%2Ffirefox-is-eating-your-ssd-here-is-how-to-fix-it%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' the-content ')] +test_url: https://www.servethehome.com/firefox-is-eating-your-ssd-here-is-how-to-fix-it/
\ No newline at end of file diff --git a/data/GrabberConfig/sf.curbed.com.txt b/data/GrabberConfig/sf.curbed.com.txt new file mode 100644 index 00000000..4c10e9c7 --- /dev/null +++ b/data/GrabberConfig/sf.curbed.com.txt @@ -0,0 +1,7 @@ +title: //h1[@class='post-title'] +author: //div[@class='post-byline']/a +date: substring-before(//div[@class='post-byline'], ', by') + +body: //div[@class='post-body'] +dissolve: //noscript +test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php
\ No newline at end of file diff --git a/data/GrabberConfig/sf.eater.com.txt b/data/GrabberConfig/sf.eater.com.txt new file mode 100644 index 00000000..1e7c85a0 --- /dev/null +++ b/data/GrabberConfig/sf.eater.com.txt @@ -0,0 +1,7 @@ +title: //h1[@class="post-title"] +author: //div[@class="post-byline"]/a +date: substring-before(//div[@class='post-byline'], ', by') + +body: //div[@class='post-body'] +strip_id_or_class: post-kicker +test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php
\ No newline at end of file diff --git a/data/GrabberConfig/sfgate.com.txt b/data/GrabberConfig/sfgate.com.txt new file mode 100644 index 00000000..54691122 --- /dev/null +++ b/data/GrabberConfig/sfgate.com.txt @@ -0,0 +1,12 @@ +title: /html/head/title + +body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')] +author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn'] +date: //div[@class = 'articleheadings']/span[@class = 'updated'] +strip: //div[div[contains(@class, 'imgbox')]] + +body: //div[@class = 'blogitem'] +author: //p[@class="credit"]/span[@class="author"]/a[position() = 1] +date: //span[@class = 'pubdate'] + +test_url: http://www.sfgate.com/columnists/garchik/
\ No newline at end of file diff --git a/data/GrabberConfig/sfweekly.com.txt b/data/GrabberConfig/sfweekly.com.txt new file mode 100644 index 00000000..73c3017e --- /dev/null +++ b/data/GrabberConfig/sfweekly.com.txt @@ -0,0 +1,3 @@ +body: //div[contains(@class, 'content_body')] +strip_id_or_class: det_rel +test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/
\ No newline at end of file diff --git a/data/GrabberConfig/shabayek.com.txt b/data/GrabberConfig/shabayek.com.txt new file mode 100644 index 00000000..59a2b0cc --- /dev/null +++ b/data/GrabberConfig/shabayek.com.txt @@ -0,0 +1,3 @@ +date: //span[@class='date'] +body: //div[@class='post_content'] +test_url: http://www.shabayek.com/blog/2016/12/04/%D8%AF%D8%B1%D9%88%D8%B3-%D8%A7%D9%84%D8%AA%D8%B3%D9%88%D9%8A%D9%82-%D9%85%D9%86-%D9%85%D8%B7%D8%B9%D9%85-%D9%81%D8%A7%D9%8A%D9%81-%D8%AC%D8%A7%D9%8A%D8%B2-%D9%84%D9%84%D8%A8%D8%B1%D8%AC%D8%B1-%D9%88/ diff --git a/data/GrabberConfig/shahrsakhtafzar.com.txt b/data/GrabberConfig/shahrsakhtafzar.com.txt new file mode 100644 index 00000000..88c8dec8 --- /dev/null +++ b/data/GrabberConfig/shahrsakhtafzar.com.txt @@ -0,0 +1,3 @@ +single_page_link: //option[contains(@value, 'showall=1')]/@value + +test_url: http://www.shahrsakhtafzar.com/fa/review/cpu/11652-amd-ryzen-7-1700x-review diff --git a/data/GrabberConfig/shawnblanc.net.txt b/data/GrabberConfig/shawnblanc.net.txt new file mode 100644 index 00000000..bd8438f7 --- /dev/null +++ b/data/GrabberConfig/shawnblanc.net.txt @@ -0,0 +1,11 @@ +title://*[@class='primary']/h1 +date: //*[@class='articledate'] +author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.') +body: //div[@class='primary'] +footnotes: yes +strip: //*[@class='primary']/h1 +strip: //*[@class='articledate'] +strip: //*[@class='detailsarticle'] +strip: //*[@class='endnav'] +strip: //*[@class='endmeta'] +test_url: http://shawnblanc.net/2011/11/kindle-touch-review/
\ No newline at end of file diff --git a/data/GrabberConfig/shifteleven.com.txt b/data/GrabberConfig/shifteleven.com.txt new file mode 100644 index 00000000..43fd871d --- /dev/null +++ b/data/GrabberConfig/shifteleven.com.txt @@ -0,0 +1,6 @@ +body: //div[ @class='entry-content' ] + +strip: //div[ contains(@class, 'sharing') ] + +date: //div[ @class='entry-meta' ]/a +test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit
\ No newline at end of file diff --git a/data/GrabberConfig/shipilev.net.txt b/data/GrabberConfig/shipilev.net.txt new file mode 100644 index 00000000..cb8c9b9e --- /dev/null +++ b/data/GrabberConfig/shipilev.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fshipilev.net%2Fblog%2F2015%2Fblack-magic-method-dispatch%2F + +body: //div[@id='content'] +test_url: https://shipilev.net/blog/2015/black-magic-method-dispatch/ diff --git a/data/GrabberConfig/siecledigital.fr.txt b/data/GrabberConfig/siecledigital.fr.txt new file mode 100644 index 00000000..730f91ca --- /dev/null +++ b/data/GrabberConfig/siecledigital.fr.txt @@ -0,0 +1,10 @@ +title: //meta[@property="og:title"]/@content +body: //div[contains(concat(' ',normalize-space(@class),' '),' post-content ')] + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' essb_links ')] + +prune: no +tidy: no + +test_url: https://siecledigital.fr/2018/08/10/twitter-condamne-france-ufc-que-choisir/ + diff --git a/data/GrabberConfig/signalscv.com.txt b/data/GrabberConfig/signalscv.com.txt new file mode 100644 index 00000000..2d3c388e --- /dev/null +++ b/data/GrabberConfig/signalscv.com.txt @@ -0,0 +1,10 @@ +author: //span[contains(@class, 'byline_1')] +date: //span[@class='posted_date'] +body: //*[contains(@class, 'bigimage_container') or contains(@class, 'overlay_text') or contains(@id, 'articlebody')] + +strip_id_or_class: leftWrapper + +prune: no + +test_url: http://www.signalscv.com/section/46/article/102948/ +test_url: http://www.signalscv.com/syndication/feeds/rss/
\ No newline at end of file diff --git a/data/GrabberConfig/singaporeanstocksinvestor.blogspot.com.txt b/data/GrabberConfig/singaporeanstocksinvestor.blogspot.com.txt new file mode 100644 index 00000000..46e2d5f2 --- /dev/null +++ b/data/GrabberConfig/singaporeanstocksinvestor.blogspot.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='post-body'] +strip: //div[@id='lws_0'] +prune: no + +test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html
\ No newline at end of file diff --git a/data/GrabberConfig/singularityhub.com.txt b/data/GrabberConfig/singularityhub.com.txt new file mode 100644 index 00000000..3999d4d4 --- /dev/null +++ b/data/GrabberConfig/singularityhub.com.txt @@ -0,0 +1,2 @@ +body://div[contains(@class,"entry-content")] +test_url: http://singularityhub.com/2011/05/21/google-invades-your-home-android-phones-control-your-appliances-and-accessories-video/
\ No newline at end of file diff --git a/data/GrabberConfig/skanesfria.se.txt b/data/GrabberConfig/skanesfria.se.txt new file mode 100644 index 00000000..a0ddac79 --- /dev/null +++ b/data/GrabberConfig/skanesfria.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.skanesfria.se/artikel/112045
\ No newline at end of file diff --git a/data/GrabberConfig/slashfilm.com.txt b/data/GrabberConfig/slashfilm.com.txt new file mode 100644 index 00000000..4d17176a --- /dev/null +++ b/data/GrabberConfig/slashfilm.com.txt @@ -0,0 +1,15 @@ +title: substring-before(//title,'| /Film') +date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by') +strip: //div[@class='pm-left'] +strip: //div[@class='pm-right'] +strip: //h2/span +next_page_link: //h2/strong/a +strip: //h2/strong/a +strip: //p[contains(text(),'we have to split this post over')] +strip: //p[@class='post-info'] +strip: //h1/a +strip: //img[contains(@src,'siteimages/authors')] +strip: //div[@id='header'] +strip: //div[@class='topad-right'] +strip: //strong[contains(text(),'Cool Posts From Around the Web:')] +test_url: http://www.slashfilm.com/superhero-bits-206/
\ No newline at end of file diff --git a/data/GrabberConfig/slate.com.txt b/data/GrabberConfig/slate.com.txt new file mode 100644 index 00000000..47c61ec7 --- /dev/null +++ b/data/GrabberConfig/slate.com.txt @@ -0,0 +1,21 @@ +title: //h1[@class="sl-art-head-dek"] +body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')] +strip: //div[@class="department_kicker"] +strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"] +strip: //div[@id="bottom_sponsored_links"] +strip: //div[@class="sl-art-ad-midflex"] +#strip: //dl +#strip: //p[em/a[contains(@href, 'facebook.com')]] +prune: no + +http_header(Cookie): GDPR_consent=1 + +author: //div[@id='author_bio']//a[contains(@href, '/author/')] +author: //a[contains(@href, '/authors.')] + +date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ') + +single_page_link: //a[@class='sl-art-sinpage'] + +test_url: http://www.slate.com/id/2274583/pagenum/all/ +test_url: http://www.slate.com/id/2293116/ diff --git a/data/GrabberConfig/slate.fr.txt b/data/GrabberConfig/slate.fr.txt new file mode 100644 index 00000000..b778a419 --- /dev/null +++ b/data/GrabberConfig/slate.fr.txt @@ -0,0 +1,20 @@ +body: //article + +strip_id_or_class: article_author +strip_id_or_class: tag_articles +strip_id_or_class: article_insert +strip_id_or_class: col-right +strip_id_or_class: aside_blockquote +strip_id_or_class: article-header__infos article-infos +strip_id_or_class: article-infos--top-article +strip_id_or_class: tag +strip_id_or_class: category +strip_id_or_class: sharing-tools +strip_id_or_class: feed-articles-aside +strip_id_or_class: article-signature +strip_id_or_class: article-header__breadcrumb + +src_lazy_load_attr: data-full-src + +test_url: http://www.slate.fr/story/115631/twitter-devin-tirages-ligue-champions +test_url: http://www.slate.fr/story/159244/katie-nouvelle-drogue-zombie diff --git a/data/GrabberConfig/slice.seriouseats.com.txt 
b/data/GrabberConfig/slice.seriouseats.com.txt new file mode 100644 index 00000000..e62a3966 --- /dev/null +++ b/data/GrabberConfig/slice.seriouseats.com.txt @@ -0,0 +1,15 @@ +body: //div[@id='content'] + +# clean up recipe pages +strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] + +#recipe pages +strip_id_or_class: "recipe-feedback" +strip_id_or_class: "comments" +strip_id_or_class: "procedure-number" +strip_id_or_class: "more-with-author" + +#slice +strip_id_or_class: "inner" + +test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html
\ No newline at end of file diff --git a/data/GrabberConfig/slog.thestranger.com.txt b/data/GrabberConfig/slog.thestranger.com.txt new file mode 100644 index 00000000..f9526945 --- /dev/null +++ b/data/GrabberConfig/slog.thestranger.com.txt @@ -0,0 +1,4 @@ +strip_id_or_class: postCategory +title: //h3[@class='postTitle'] +body: //div[@class='postBody'] +test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone
\ No newline at end of file diff --git a/data/GrabberConfig/smashingmagazine.com.txt b/data/GrabberConfig/smashingmagazine.com.txt new file mode 100644 index 00000000..53637764 --- /dev/null +++ b/data/GrabberConfig/smashingmagazine.com.txt @@ -0,0 +1,12 @@ +date: //meta[@property="DC.date.issued"]/@content + +strip_id_or_class: lt +strip_id_or_class: top +strip_id_or_class: pmd +strip_id_or_class: hsl +strip_id_or_class: further-reading-on-smashingmag + +strip: //a[@class='sot single'] +strip: //div[@class='drop-caps'] + +test_url: https://www.smashingmagazine.com/2017/06/building-production-ready-css-grid-layout/ diff --git a/data/GrabberConfig/smbc-comics.com.txt b/data/GrabberConfig/smbc-comics.com.txt new file mode 100644 index 00000000..66e3718a --- /dev/null +++ b/data/GrabberConfig/smbc-comics.com.txt @@ -0,0 +1,12 @@ +find_string:display:none +replace_string:display:inline + +find_string:<img src +replace_string:<div/><img src + +body: //*[@id="cc-comicbody"] | //*[@id="aftercomic"] + +test_url: http://www.smbc-comics.com/index.php?id=2039 + +prune: no +tidy: no diff --git a/data/GrabberConfig/sme.sk.txt b/data/GrabberConfig/sme.sk.txt new file mode 100644 index 00000000..d41612cc --- /dev/null +++ b/data/GrabberConfig/sme.sk.txt @@ -0,0 +1,3 @@ +title: //meta[@property='og:title']/@content +date: //p[@class='autor_line']/b/text() +test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html
\ No newline at end of file diff --git a/data/GrabberConfig/smh.com.au.txt b/data/GrabberConfig/smh.com.au.txt new file mode 100644 index 00000000..8cc45ed6 --- /dev/null +++ b/data/GrabberConfig/smh.com.au.txt @@ -0,0 +1,18 @@ +body: //article +title: //h1[@class='cN-headingPage'] +author: //h3[@class='authorName'] +date: //dd[@class='updated dtstamp'] + +strip: //ul[@class='social sponsored cfix'] +strip: //div[contains(@class, 'hiddenVisually')] +strip: //dd[@class='updated dtstamp'] +strip: //h3[@class='authorName'] +strip: //ul[@class='social cfix'] +strip: //div[contains(@id, 'adspot')] +strip: //aside + +strip: //div[contains(@class, 'overlayPlayCountdown')] +strip: //div[@class='fdVideoWof']//span[@class='gone'] + +test_url: http://www.smh.com.au/world/donald-trump-travel-ban-us-appeals-court-upholds-suspension-20170209-gu9ta8.html +test_contains: Trump's January 27 executive order barred diff --git a/data/GrabberConfig/smithsonianmag.com.txt b/data/GrabberConfig/smithsonianmag.com.txt new file mode 100644 index 00000000..fc479c2a --- /dev/null +++ b/data/GrabberConfig/smithsonianmag.com.txt @@ -0,0 +1,23 @@ +# meta data +title://h1[@id = 'articleTitle'] +author:substring-after(//ul[@id = 'byLine']/li[1],'By ') +date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',') +body://div[@id = 'article-body'] + +# full content +single_page_link://td/li[@class = 'article-singlepage']/a + +# continue link +single_page_link: //a[@id='continue-btn'] + +# caption clean up +wrap_in(i)://span[@class='articleImageCaptionwide'] +move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p + + +# clean up +strip://p[@id = 'articlePaginationWrapper'] +strip://ul[contains(@class, 'cat-breadcrumb')] +strip://div [@class= 'viewMorePhotos'] + +test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html diff --git a/data/GrabberConfig/snip.ly.txt b/data/GrabberConfig/snip.ly.txt new file mode 100644 index 
00000000..4e80fcae --- /dev/null +++ b/data/GrabberConfig/snip.ly.txt @@ -0,0 +1,3 @@ +single_page_link: //meta[@property="og:url"]/@content + +test_url: http://snip.ly/qa1R
\ No newline at end of file diff --git a/data/GrabberConfig/socialter.fr.txt b/data/GrabberConfig/socialter.fr.txt new file mode 100644 index 00000000..126cc491 --- /dev/null +++ b/data/GrabberConfig/socialter.fr.txt @@ -0,0 +1,12 @@ + +body: //div[@id='artLinkAnc'] +body: //div[contains(concat(' ',normalize-space(@class),' '),' artAcc ')] | //div[contains(concat(' ',normalize-space(@class),' '),' artTxt ')] + +author: substring-before(//label[contains(concat(' ',normalize-space(@class),' '),' artAuth ')], ', Le ') + +# remove the sentence about buying the newspaper: +strip: //a[starts-with(@href, 'https://abo.socialter.fr/')]/parent::* + +test_url: http://www.socialter.fr/fr/module/99999672/696/ynis_varoufkis__si_je_deviens_un_bon_politicien_flinguez_moi +test_url: http://www.socialter.fr/fr/module/99999672/659/enqute__comment_leurope_bidouille_ses_objectifs_de_transition_nergtique + diff --git a/data/GrabberConfig/somethingawful.com.txt b/data/GrabberConfig/somethingawful.com.txt new file mode 100644 index 00000000..48547948 --- /dev/null +++ b/data/GrabberConfig/somethingawful.com.txt @@ -0,0 +1,17 @@ +title: //h1 +body: //div[@id = 'content-area'] +author: //p[contains(@class, 'byline')]/a +autodetect_next_page: yes +tidy: no + +strip_id_or_class: articleid +strip_id_or_class: logo +strip_id_or_class: pagebar +strip_id_or_class: featurenavlinks +strip_id_or_class: featured_frontpage +strip_id_or_class: sidebar +strip_id_or_class: footer +strip_id_or_class: byline +strip_id_or_class: logo +strip_id_or_class: nav_network +test_url: http://www.somethingawful.com/d/dungeons-and-dragons/wtf-monster-manual.php
\ No newline at end of file diff --git a/data/GrabberConfig/songshuhui.net.txt b/data/GrabberConfig/songshuhui.net.txt new file mode 100644 index 00000000..a9233593 --- /dev/null +++ b/data/GrabberConfig/songshuhui.net.txt @@ -0,0 +1,7 @@ +# This filter is tested on: +# http://songshuhui.net/archives/65522 +# http://songshuhui.net/archives/75760 +title://h2/span/a +date:substring-before(substring-after(//div[@class='atrctitle']/div, '发表于'),' |') +body://div[@class='entry'] +test_url: http://songshuhui.net/archives/74819
\ No newline at end of file diff --git a/data/GrabberConfig/soundcity.tv.txt b/data/GrabberConfig/soundcity.tv.txt new file mode 100644 index 00000000..c26b9f95 --- /dev/null +++ b/data/GrabberConfig/soundcity.tv.txt @@ -0,0 +1,3 @@ +strip_id_or_class: sharing + +test_url: http://soundcity.tv/feed/ diff --git a/data/GrabberConfig/sourcebooks.com.txt b/data/GrabberConfig/sourcebooks.com.txt new file mode 100644 index 00000000..86e3df5e --- /dev/null +++ b/data/GrabberConfig/sourcebooks.com.txt @@ -0,0 +1,4 @@ +#grab the actual content div +body: //div[@class='rt-article'] + +test_url: http://www.sourcebooks.com/blog/happy-27th-birthday-sourcebooks.html diff --git a/data/GrabberConfig/sowetanlive.co.za.txt b/data/GrabberConfig/sowetanlive.co.za.txt new file mode 100644 index 00000000..2a1983cc --- /dev/null +++ b/data/GrabberConfig/sowetanlive.co.za.txt @@ -0,0 +1,7 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-widgets ')] +strip_id_or_class: related_articles +find_string:=s180" +replace_string:=s750" + +test_url: https://www.sowetanlive.co.za/rss/?publication=sowetan-live +test_url: https://www.sowetanlive.co.za/sundayworld/news/2017-11-08-manana-should-pay-medical-bills-for-assault-victims-court-told/ diff --git a/data/GrabberConfig/spectrum.ieee.org.txt b/data/GrabberConfig/spectrum.ieee.org.txt new file mode 100644 index 00000000..aea3627e --- /dev/null +++ b/data/GrabberConfig/spectrum.ieee.org.txt @@ -0,0 +1,3 @@ +body://div[@class="articleBody"] +author://p[@class="articleBodyTtl"] +test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/
\ No newline at end of file diff --git a/data/GrabberConfig/spektrum.de.txt b/data/GrabberConfig/spektrum.de.txt new file mode 100644 index 00000000..232bda3c --- /dev/null +++ b/data/GrabberConfig/spektrum.de.txt @@ -0,0 +1,17 @@ +body: //article[contains(@class, 'content')] + +date: //li[contains(@class, 'content__meta__date')] + +title: //h2[contains(@class, 'content__title')] + +author: //div[contains(@class, 'content__author__info__name')]//strong + +strip: //div[contains(@class, 'content__kicker')] +strip: //figure[contains(@class, 'image__article__top')] +strip: //div/div/ul[contains(@class, 'breadcrumbs')] +strip: //aside/ul[contains(@class, 'content__meta')] +strip: //aside[contains(@class, 'hide-for-print')] +strip: //div[contains(@class, 'hide-for-print')] +strip: //div[contains(@class, 'content__author')] + +test_url: https://www.spektrum.de/news/europas-vernichtende-jahrtausendduerre/1584414 diff --git a/data/GrabberConfig/spiegel.de.txt b/data/GrabberConfig/spiegel.de.txt new file mode 100644 index 00000000..7b7b1752 --- /dev/null +++ b/data/GrabberConfig/spiegel.de.txt @@ -0,0 +1,96 @@ +# A. Niepel, narya.de@... +# - added single_page_link +# - added author for default and single page view +# - added date for single page view +# fforst@... +# - Fixed it +# bode2104@... 
+# - Fixed single_page_link +# - Included intro text in single page view +# - Added body in default view +# stesie@ +# - removed copyright box +# - removed "print more" box + +# set body +tidy: no +# body in single page view +body: //div[@id="spArticleContent"] +# body in default view +body: //div[@id="spArticleSection"] +body: //div[contains(@class, 'article-section')] | //div[@id='js-article-top-wide-asset'] | //p[contains(@class, 'article-intro')] | //div[contains(@class, 'js-module-box-image')] +# body in "Fotostrecke" +body: //div[@id="spBigaContent"] + +# set date in single page view +date: //div[@id="spArticleContent"]/h3 +# strip date +strip: //div[@id="spArticleContent"]/h3 +# set date in "Fotostrecke" +date: //div[@id="spBigaDatum"] + +# title in default view +title: //h2[contains(@class, 'article-title')] +#set title in single page view +title: //div[@id='spArticleContent']/h2 +# strip title +strip: //div[@id='spArticleContent']/h1 +strip: //div[@id='spArticleContent']/h2 +#set title in "Fotostrecke" +title: //div[@class='spBigaHeadline'] + +# set author +author: //p[@class="spAuthor"]/a +author: substring-after(//p[@class="spAuthor"], 'Von ') +# strip author +strip: //p[@class='spAuthor'] + +# remove captions +strip: //*/span[@class='spPicLayerText'] +strip: //*/div[@class='spPanoPlayerPaneControl'] +strip: //*/div[@class='spCredit'] +strip: //*/div[@class='spCredit']/following-sibling::p + +# remove ads +strip: //div[@class='spMInline'] + +# remove photogalleries and extras +strip: //div[contains(@class, 'spPhotoGallery')] +strip: //div[@class='spPhotoGallery']/following-sibling::br +strip: //div[@class='spAssetAlignleft'] +strip: //div[contains(@class,'spAsset')] +strip: //br[@clear='all'] + +# remove community functions +strip: //div[@id='spSocialBookmark'] +strip: //div[contains(@class, 'spCommunityBox')] +strip: //div[contains(@class, 'spArticleNewsfeedBox')] +strip: //div[@class='spArticleCredit'] + +# remove clutter in "Fotostrecke" +strip: 
//div[@id='spBreadcrumb'] +strip: //div[@id='spBigaLatestEntries'] +strip: //div[contains(@class, 'spBigaNavi')] +strip: //div[@class='spDottedLine'] + +strip: //div[@class='asset-box article-print-more'] +strip: //div[@class='article-copyright'] +strip: //span[@class='image-buttons'] + +# Use link to print article for single page view +single_page_link: //a[contains(@href, '-druck')] +if_page_contains: //div[contains(@class, 'multi-pager-control')] + +# Clean up title in print view +find_string: <title>Druckversion - +replace_string: <title> + +# use next link in "Fotostrecke" +next_page_link: //a[@class='spBigaControlForw'] +test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html + +# regular article +test_url: http://www.spiegel.de/wirtschaft/soziales/griechenland-was-den-griechischen-buergern-nun-droht-a-1042682.html + +# multipage article +test_url: http://www.spiegel.de/spiegel/a-710880.html
\ No newline at end of file diff --git a/data/GrabberConfig/spiil.org.txt b/data/GrabberConfig/spiil.org.txt new file mode 100644 index 00000000..dd12fb88 --- /dev/null +++ b/data/GrabberConfig/spiil.org.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.spiil.org%2Fguide-bonnes-pratiques-editeurs-de-presse-ligne + +body: //div[contains(concat(' ',normalize-space(@class),' '),' node-content ')] +test_url: https://www.spiil.org/guide-bonnes-pratiques-editeurs-de-presse-ligne diff --git a/data/GrabberConfig/spiked-online.com.txt b/data/GrabberConfig/spiked-online.com.txt new file mode 100644 index 00000000..7ec39c2b --- /dev/null +++ b/data/GrabberConfig/spiked-online.com.txt @@ -0,0 +1,7 @@ +title: //div[@id='articleTitleWrapper' or @id='mainFeature']//h1 +author: //*[@id='authorNameJob']//a +date: //div[@id='articleMeta']/p +body: //div[@id='mainFeature']//img | //div[contains(@class, 'fullText')] + +test_url: http://www.spiked-online.com/newsite/article/standing_up_to_the_white-coated_gods_of_fortune/13785 +test_url: http://www.spiked-online.com/newsite/article/sex_box_and_the_crisis_of_intimacy/14168
\ No newline at end of file diff --git a/data/GrabberConfig/spin.com.txt b/data/GrabberConfig/spin.com.txt new file mode 100644 index 00000000..88eb454c --- /dev/null +++ b/data/GrabberConfig/spin.com.txt @@ -0,0 +1,5 @@ +tidy: no +body: //section[contains(@class, 'main')] +strip: //footer +strip: //a[@class='paginated'] +test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare
\ No newline at end of file diff --git a/data/GrabberConfig/splitsider.com.txt b/data/GrabberConfig/splitsider.com.txt new file mode 100644 index 00000000..4bbc7aac --- /dev/null +++ b/data/GrabberConfig/splitsider.com.txt @@ -0,0 +1,4 @@ +author: //div[@class='byline']/a +date: //div[@id='date'] +body: //div[@class='entry'] +test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/
\ No newline at end of file diff --git a/data/GrabberConfig/sport.detik.com.txt b/data/GrabberConfig/sport.detik.com.txt new file mode 100644 index 00000000..18552d1e --- /dev/null +++ b/data/GrabberConfig/sport.detik.com.txt @@ -0,0 +1,8 @@ +title://div[@class="content_detail"]/h1 + +author://div[@class="author"]/strong + +date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB') + +body://div[@class='text_detail'] +test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270
\ No newline at end of file diff --git a/data/GrabberConfig/sport365.fr.txt b/data/GrabberConfig/sport365.fr.txt new file mode 100644 index 00000000..8688f40b --- /dev/null +++ b/data/GrabberConfig/sport365.fr.txt @@ -0,0 +1,8 @@ +body: //h2[contains(@class, 'body_head')] | //div[@id='img_article' or contains(@class, 'body_content')] +body: //div[contains(@class, 'cpanel')]//div[contains(@class, 'thumbnails')] +prune: no +strip: //div[starts-with(@class, 'actu_')] +strip: //div[contains(@class, 'data')] + +test_url: http://www.sport365.fr/basketball/nba/new-york-accord-avec-toronto-pour-bargnani-1038773.shtml +test_url: http://www.sport365.fr/rss.xml
\ No newline at end of file diff --git a/data/GrabberConfig/sports.yahoo.com.txt b/data/GrabberConfig/sports.yahoo.com.txt new file mode 100644 index 00000000..b0f57e2c --- /dev/null +++ b/data/GrabberConfig/sports.yahoo.com.txt @@ -0,0 +1,9 @@ +title: //div[@id='article']/div[@class='hd']/h1 +body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0'] +strip: //div[@class='foot'] +strip: //div[@id='sidebar']//div[@class='ft'] +strip: //p[@id='byline']//em +tidy: no +prune: no + +test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals
\ No newline at end of file diff --git a/data/GrabberConfig/sprengsatz.de.txt b/data/GrabberConfig/sprengsatz.de.txt new file mode 100644 index 00000000..5b683811 --- /dev/null +++ b/data/GrabberConfig/sprengsatz.de.txt @@ -0,0 +1,5 @@ +title: //h2 +author: string('Michael Spreng') +date: //div[@class='date'] +body: //div[@class='entry'] +test_url: http://www.sprengsatz.de/?p=3691
\ No newline at end of file diff --git a/data/GrabberConfig/sputniknews.com.txt b/data/GrabberConfig/sputniknews.com.txt new file mode 100644 index 00000000..c3f4234e --- /dev/null +++ b/data/GrabberConfig/sputniknews.com.txt @@ -0,0 +1,4 @@ +body: //div[contains(concat(' ',normalize-space(@class),' '),' b-article__header ')] | //div[contains(@class, 'b-article__lead') or contains(@class, 'b-article__text') or contains(@class, 'b-article__videoclub_embed')] + +test_url: https://sputniknews.com/world/201708131056427079-lefties-scientists-study/ +test_contains: Lefties are luckier as diff --git a/data/GrabberConfig/sqlite.org.txt b/data/GrabberConfig/sqlite.org.txt new file mode 100644 index 00000000..15763c32 --- /dev/null +++ b/data/GrabberConfig/sqlite.org.txt @@ -0,0 +1,7 @@ +body: //div[@id='ff-body'] + +replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center> + +prune: no + +test_url: http://www.sqlite.org/fileformat2.html
\ No newline at end of file diff --git a/data/GrabberConfig/squashed.tumblr.com.txt b/data/GrabberConfig/squashed.tumblr.com.txt new file mode 100644 index 00000000..8eae13ed --- /dev/null +++ b/data/GrabberConfig/squashed.tumblr.com.txt @@ -0,0 +1,4 @@ +body: //div[@class='content'] +date: substring-before( //div[@class='unit dateAndNotes'], 'with') +title: //h3 +test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending
\ No newline at end of file diff --git a/data/GrabberConfig/srf.ch.txt b/data/GrabberConfig/srf.ch.txt new file mode 100644 index 00000000..d07a9050 --- /dev/null +++ b/data/GrabberConfig/srf.ch.txt @@ -0,0 +1,24 @@ +# Author: cirnod@gmail.com + +tidy: no +prune: no + +body: //div[@id="article-content"]/p | //div[@class="main-article-content clearfix"] + +# General Cleanup +strip_id_or_class: offscreen +strip_id_or_class: video-description +strip_id_or_class: v2 big-video +strip_id_or_class: module smb freetext +strip_id_or_class: asset span3 +strip_id_or_class: module smb related-links + +# fix image-galleries +strip_id_or_class: module lightbox-gallery image hide +replace_string(width="624"): width="100%" +replace_string(height="468"): height="%" + +# Try yourself +test_url: http://www.srf.ch/news/wirtschaft/weltbank-korrigiert-konjunktur-erwartungen-nach-unten +test_url: http://www.srf.ch/news/wirtschaft/ural-statt-alpen-russische-touristen-bleiben-zuhause +test_url: http://www.srf.ch/news/international/zwei-schweizer-bei-blutigem-attentat-in-mali-verletzt
\ No newline at end of file diff --git a/data/GrabberConfig/stackoverflow.blog.txt b/data/GrabberConfig/stackoverflow.blog.txt new file mode 100644 index 00000000..3b013a89 --- /dev/null +++ b/data/GrabberConfig/stackoverflow.blog.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fstackoverflow.blog%2F2015%2F01%2F16%2Fwhy-we-still-believe-in-private-offices%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' m-post-content ')] +test_url: https://stackoverflow.blog/2015/01/16/why-we-still-believe-in-private-offices/
\ No newline at end of file diff --git a/data/GrabberConfig/stackoverflow.com.txt b/data/GrabberConfig/stackoverflow.com.txt new file mode 100644 index 00000000..bb95e93a --- /dev/null +++ b/data/GrabberConfig/stackoverflow.com.txt @@ -0,0 +1,14 @@ +body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2 + +replace_string(<div class="user-details"><br></div>): <!-- nothing --> +replace_string(<div class="vote">): <div class="vote"><h3>Vote count: + +strip_id_or_class: vote-up +strip_id_or_class: vote-down +strip_id_or_class: star-off +strip_id_or_class: favoritecount +strip_id_or_class: -share +strip_id_or_class: badgecount + + +test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax
\ No newline at end of file diff --git a/data/GrabberConfig/standard.co.uk.txt b/data/GrabberConfig/standard.co.uk.txt new file mode 100644 index 00000000..71a2bda1 --- /dev/null +++ b/data/GrabberConfig/standard.co.uk.txt @@ -0,0 +1,16 @@ +autodetect_next_page: no +footnotes: no +dissolve: //div[@class="column-2"]//div[@class="widget"] +dissolve: //div[@class="column-2"]//div + +author: //div[@class="innerbyline"]/a +strip: //div[@class="innerbyline"]/a + +strip: //p[@class="dateline"] +date: //p[@class="dateline"] + +title: //h1[@class="title"] +author: //div[@class="innerbyline"]/a +date: //p[@class="dateline"] +body: //div[@class="column-2"] +test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html
\ No newline at end of file diff --git a/data/GrabberConfig/stephenfry.com.txt b/data/GrabberConfig/stephenfry.com.txt new file mode 100644 index 00000000..efd1ec2b --- /dev/null +++ b/data/GrabberConfig/stephenfry.com.txt @@ -0,0 +1,8 @@ +title: /html/head/meta[@name='title']/@content +author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a +date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')] + +body: //div[@class='entry-content'] + +single_page_link: //p[@class='pagination']/a +test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/
\ No newline at end of file diff --git a/data/GrabberConfig/stjv.fr.txt b/data/GrabberConfig/stjv.fr.txt new file mode 100644 index 00000000..2c15fb4a --- /dev/null +++ b/data/GrabberConfig/stjv.fr.txt @@ -0,0 +1,3 @@ +date: //time[contains(@class, 'published')]/@datetime + +test_url: https://www.stjv.fr/2018/01/au-sujet-quantic-dream/ diff --git a/data/GrabberConfig/stockholmsfria.se.txt b/data/GrabberConfig/stockholmsfria.se.txt new file mode 100644 index 00000000..ee874832 --- /dev/null +++ b/data/GrabberConfig/stockholmsfria.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.stockholmsfria.se/artikel/112068 diff --git a/data/GrabberConfig/straightdope.com.txt b/data/GrabberConfig/straightdope.com.txt new file mode 100644 index 00000000..f01d7ad1 --- /dev/null +++ b/data/GrabberConfig/straightdope.com.txt @@ -0,0 +1,6 @@ +body: //div[@id='article' or @id='current_illustration'] +title: //div[@id='article']//h1 +date: //div[@id='article']//div[@class='date'] +prune: no + +test_url: http://www.straightdope.com/columns/read/947/whatever-happened-to-adoption-of-the-metric-system-in-the-u-s
\ No newline at end of file diff --git a/data/GrabberConfig/stratfor.com.txt b/data/GrabberConfig/stratfor.com.txt new file mode 100644 index 00000000..f7a64598 --- /dev/null +++ b/data/GrabberConfig/stratfor.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.stratfor.com%2Fanalysis%2Frussias-plans-arctic-supremacy + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-body-wrap ')] +test_url: https://www.stratfor.com/analysis/russias-plans-arctic-supremacy diff --git a/data/GrabberConfig/streetsblog.net.txt b/data/GrabberConfig/streetsblog.net.txt new file mode 100644 index 00000000..6cf03ca6 --- /dev/null +++ b/data/GrabberConfig/streetsblog.net.txt @@ -0,0 +1,7 @@ +title: //h2[@class="post-title"] +date: //span[@class="post-date"] +body: //div[@class="post-entry"] + +#This is also good for *.streetsblog.org, for example: +#http://dc.streetsblog.org/2011/10/21/friday-job-market/ +test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/
\ No newline at end of file diff --git a/data/GrabberConfig/stuff.co.nz.txt b/data/GrabberConfig/stuff.co.nz.txt new file mode 100644 index 00000000..c1a7e574 --- /dev/null +++ b/data/GrabberConfig/stuff.co.nz.txt @@ -0,0 +1,21 @@ +title://div[@id='left_col']/h1 +author:substring-after(//span[contains(@class,'storycredit')],'BY ') +author://span[contains(@class,'storycredit')] +date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ') +date://div[contains(@class,'toolbox_date')] +body://div[@id='left_col'] + +strip_id_or_class: toolbox +strip_id_or_class: story_features +strip_id_or_class: sharebox_new +strip_id_or_class: related_box +strip_id_or_class: sponsored_links +strip_id_or_class: hidden_ad +strip_id_or_class: story_content_top +strip_id_or_class: total_number +strip_id_or_class: sort_order +strip_id_or_class: subscribe_order + +strip://div[contains(@class,'ad_story')] + +test_url: https://www.stuff.co.nz/business/103470616/behind-the-pricing-internal-email-lifts-veil-on-bps-petrol-prices diff --git a/data/GrabberConfig/stumbleupon.com.txt b/data/GrabberConfig/stumbleupon.com.txt new file mode 100644 index 00000000..9adc3c50 --- /dev/null +++ b/data/GrabberConfig/stumbleupon.com.txt @@ -0,0 +1,3 @@ +single_page_link: //iframe[@id='tb-stumble-frame']/@src + +test_url: http://www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/
\ No newline at end of file diff --git a/data/GrabberConfig/subtraction.com.txt b/data/GrabberConfig/subtraction.com.txt new file mode 100644 index 00000000..9ba6eb77 --- /dev/null +++ b/data/GrabberConfig/subtraction.com.txt @@ -0,0 +1,17 @@ +title: //*[@id='posts']/div[1]/h2 +author: //*[@id='posts']/div[1]/div[2]/span[2]/a +date: //*[@class='date'] +body: //div[@class='body-lead'] + +# take out the bit saying 'body' +strip: //div[@class='body-lead']/div[@class='info-label'] + + + + + + + + + +test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations
\ No newline at end of file diff --git a/data/GrabberConfig/sueddeutsche.de.txt b/data/GrabberConfig/sueddeutsche.de.txt new file mode 100644 index 00000000..30eed9b0 --- /dev/null +++ b/data/GrabberConfig/sueddeutsche.de.txt @@ -0,0 +1,36 @@ +# Important Note: +# The last paragraph of any article seems to be a teaser from the sports section. + +tidy: no +prune: no +autodetect_on_failure: yes + +date: //time[@class='timeformat'] +title: //section[@class='header']//h2 +body: //article[@id="sitecontent"] +author: //meta[@name="author"]/@content + +replace_string(<h2>): <h3> +replace_string(</h2>): </h3> + +#strip: //section[@class="header"] +strip: //section[@class='header']//time +strip: //section[@class='header']//strong +strip: //div[@id="article-sidebar-wrapper"] +strip: //p[@class="anzeige"] +strip: //section[@class="authors"] +strip_id_or_class: teaserable-layout +strip: //noscript +strip: //div//i + +next_page_link: //a[@class='article-paging-nav-btn'] + +# 1 page +test_url: http://www.sueddeutsche.de/reise/bergwinter-saisonstart-in-den-skigebieten-geht-schon-1.3251799 +test_contains: Ranalt. Der steile Anstieg kurz nach dem Tunnel. 
+# 2 pages +test_url: http://www.sueddeutsche.de/kultur/politik-und-philosophie-warum-trump-jedes-mittel-recht-ist-1.3256078 +test_contains: Sucht man die Spuren des Pragmatismus +# 3 pages +test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693 +test_contains: Kammer zugemauert gewesen und das diff --git a/data/GrabberConfig/summitroute.com.txt b/data/GrabberConfig/summitroute.com.txt new file mode 100644 index 00000000..57634913 --- /dev/null +++ b/data/GrabberConfig/summitroute.com.txt @@ -0,0 +1,3 @@ +prune: no + +test_url: https://summitroute.com/blog/2015/12/24/instagram_bounty_case_study_for_defense/ diff --git a/data/GrabberConfig/sunshinecoastdaily.com.au.txt b/data/GrabberConfig/sunshinecoastdaily.com.au.txt new file mode 100644 index 00000000..46eccec0 --- /dev/null +++ b/data/GrabberConfig/sunshinecoastdaily.com.au.txt @@ -0,0 +1,12 @@ +body: //section//article//p + +strip: //aside +strip: //div[@class='margin-top-15'] +strip: //p[@class='tags'] + +author: //span[@class='byline']//ul[@class='piped']//li[1] +date: //span[@class='byline']//ul[@class='piped']//li[2] + +parser: html5php + +test_url: https://www.sunshinecoastdaily.com.au/news/questions-over-accuseds-clothes-in-milos-murder-tr/3141729/ diff --git a/data/GrabberConfig/svd.se.txt b/data/GrabberConfig/svd.se.txt new file mode 100644 index 00000000..bc0a1ca0 --- /dev/null +++ b/data/GrabberConfig/svd.se.txt @@ -0,0 +1,14 @@ +body: //div[@id='article-content'] +author: //div[@id='article']//div[@class='byline']/p + +# Ads +strip_id_or_class: articlead + +# Sharing +strip_id_or_class: share + +prune: no + +test_url: http://www.svd.se/nyheter/inrikes/oppositionen-stoppar-skattesankning_8531228.svd +test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd +test_url: http://www.svd.se/?service=rss&type=senastenytt
\ No newline at end of file diff --git a/data/GrabberConfig/svt.se.txt b/data/GrabberConfig/svt.se.txt new file mode 100644 index 00000000..779cfb1b --- /dev/null +++ b/data/GrabberConfig/svt.se.txt @@ -0,0 +1,17 @@ +title: //article[@role='main']//h1 +body: //article[@role='main'] +strip: //aside +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> +strip_id_or_class: svtHide-No-Js +strip_id_or_class: aside +strip_id_or_class: Aside +strip_id_or_class: hidden +strip_id_or_class: Share +tidy: no +prune: no + +test_url: http://www.svt.se/ug/framtidsdrommar-om-jobb-blev-lackande-gifthal +test_contains: Det pågår alltså en gruvboom i Sverige +test_url: http://www.svt.se/nyheter/het-debatt-mellan-borg-och-andersson +test_contains: Anders Borg (M) började med diff --git a/data/GrabberConfig/switchonpaper.com.txt b/data/GrabberConfig/switchonpaper.com.txt new file mode 100644 index 00000000..cdd7cda6 --- /dev/null +++ b/data/GrabberConfig/switchonpaper.com.txt @@ -0,0 +1,25 @@ + +body: //div[@itemprop='text'] +author: //div[contains(concat(' ',normalize-space(@class),' '),' postauthor ')]//div[contains(concat(' ',normalize-space(@class),' '),' authordesc ')]//a + +strip_id_or_class: postinfo +strip_id_or_class: pmpro_content_message + +test_url: https://www.switchonpaper.com/2018/07/18/otobong-nkanga-ethique-de-la-cooperation/ + +# ---------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +# ---------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //div[contains(concat(' ',normalize-space(@class),' '),' pmpro_content_message ')] +login_uri: https://www.switchonpaper.com/login/ +login_username_field: log +login_password_field: pwd +login_extra_fields: redirect_to=https://www.switchonpaper.com/wp-admin/ +login_extra_fields: rememberme=forever +login_extra_fields: submit=Se+connecter +login_extra_fields: 
testcookie=@=xpath('//input[@name="testcookie"]', request_html('https://www.switchonpaper.com/se-connecter/')) + +test_url: https://www.switchonpaper.com/2018/07/25/gustav-metzgeragir-ou-perir/ + + diff --git a/data/GrabberConfig/sydsvenskan.se.txt b/data/GrabberConfig/sydsvenskan.se.txt new file mode 100644 index 00000000..c606411c --- /dev/null +++ b/data/GrabberConfig/sydsvenskan.se.txt @@ -0,0 +1,5 @@ +body: //div[contains(@class, 'article-content')] + +test_url: http://www.sydsvenskan.se/2017-02-09/svensk-gripen-misstankt-for-kokainsmugglingen-i-landskrona +test_contains: De fann 18 kilo kokain, värt 55 miljoner kronor +test_url: http://www.sydsvenskan.se/rss.xml diff --git a/data/GrabberConfig/symmetrymagazine.org.txt b/data/GrabberConfig/symmetrymagazine.org.txt new file mode 100644 index 00000000..5bcfb9ef --- /dev/null +++ b/data/GrabberConfig/symmetrymagazine.org.txt @@ -0,0 +1,12 @@ +title: //div[contains(@class, "post")]/h2 + +author: //div[contains(@class, "post")]/p[position()=last()]/text()[1] + +date: //div[contains(@class, "post")]/p[1] + +body: //div[contains(@class, "post")] + +strip: //div[contains(@class, "post")]/h2[1] +strip: //div[contains(@class, "post")]/p[1] +strip: //div[contains(@class, "post")]/p[position()=last()] +test_url: http://www.symmetrymagazine.org/breaking/?p=12784
\ No newline at end of file diff --git a/data/GrabberConfig/sz-magazin.sueddeutsche.de.txt b/data/GrabberConfig/sz-magazin.sueddeutsche.de.txt new file mode 100644 index 00000000..e058032c --- /dev/null +++ b/data/GrabberConfig/sz-magazin.sueddeutsche.de.txt @@ -0,0 +1,15 @@ +title: //h1 +body://div[@class='drucken'] +author: substring-after(//span[@class='autor'], 'Von ') +author: //span[@class='autor'] + +single_page_link://a[contains(@href, '/drucken/')] +convert_double_br_tags:yes + +dissolve://div[@class='vorspann'] + +strip://h1 +strip_id_or_class: klassifizierung +strip_id_or_class: source +strip_id_or_class: autor +test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567
\ No newline at end of file diff --git a/data/GrabberConfig/t3n.de.txt b/data/GrabberConfig/t3n.de.txt new file mode 100644 index 00000000..c194b639 --- /dev/null +++ b/data/GrabberConfig/t3n.de.txt @@ -0,0 +1,3 @@ +next_page_link: //link[@rel='next'] + +test_url: http://t3n.de/news/zukunftstag-2017-t3n-818436/ diff --git a/data/GrabberConfig/tabletmag.com.txt b/data/GrabberConfig/tabletmag.com.txt new file mode 100644 index 00000000..58b1f5bb --- /dev/null +++ b/data/GrabberConfig/tabletmag.com.txt @@ -0,0 +1,5 @@ +body: //div[contains(@class, 'story-text')] + +strip_id_or_class: related + +test_url: http://www.tabletmag.com/jewish-news-and-politics/181181/mossberg-parallel-states?all=1
\ No newline at end of file diff --git a/data/GrabberConfig/tagblatt.de.txt b/data/GrabberConfig/tagblatt.de.txt new file mode 100644 index 00000000..155ae8a9 --- /dev/null +++ b/data/GrabberConfig/tagblatt.de.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.tagblatt.de%2FNachrichten%2F19-Jaehrige-taeuschte-eine-Ohnmacht-vor-und-machte-sich-ueber-die-Retter-lustig-374860.html + +title: //header//h2 +date: //div[contains(concat(' ',normalize-space(@class),' '),' artikelhead ')]//span +author: //div[contains(concat(' ',normalize-space(@class),' '),' artikelhead ')]//ul//li +body: //div[contains(concat(' ',normalize-space(@class),' '),' main-article ')]//p | //div[contains(concat(' ',normalize-space(@class),' '),' divcontent ')]//div +test_url: https://www.tagblatt.de/Nachrichten/19-Jaehrige-taeuschte-eine-Ohnmacht-vor-und-machte-sich-ueber-die-Retter-lustig-374860.html diff --git a/data/GrabberConfig/tagesanzeiger.ch.txt b/data/GrabberConfig/tagesanzeiger.ch.txt new file mode 100644 index 00000000..45c5cd02 --- /dev/null +++ b/data/GrabberConfig/tagesanzeiger.ch.txt @@ -0,0 +1,14 @@ +# Author: cirnod@gmail.com + +tidy: no +prune: no + +body: //div[@id="article"]/h3 | //*[@id="mainContent"] + +# General Cleanup +#strip_id_or_class: info_panel + + +# Try yourself +test_url: http://www.tagesanzeiger.ch/zuerich/stadt/Nach-spektakulaerer-Abseilaktion-verhaftet/story/18039895 +test_url: http://www.tagesanzeiger.ch/ausland/naher-osten-und-afrika/IS-zerstoert-auch-das-antike-Hatra/story/19865699 diff --git a/data/GrabberConfig/tagesschau.de.txt b/data/GrabberConfig/tagesschau.de.txt new file mode 100644 index 00000000..c1e7f76f --- /dev/null +++ b/data/GrabberConfig/tagesschau.de.txt @@ -0,0 +1,15 @@ +body: //div[contains(@class, 'sectionArticle') and contains(@class, 'sectionZ')] + +strip_id_or_class: infokasten +strip_id_or_class: teaserImTeaser +strip_id_or_class: 
Comments +strip_id_or_class: mediaInfo +strip: //div[contains(@class, 'mediaCon')]//iframe +strip_id_or_class: metablockwrapper + +prune: no + +test_url: http://www.tagesschau.de/kommentar/nordkorea-suedkorea-113.html +test_contains: Staatsmann wahrgenommen + +test_url: http://www.tagesschau.de/xml/rss2 diff --git a/data/GrabberConfig/tagesspiegel.de.txt b/data/GrabberConfig/tagesspiegel.de.txt new file mode 100644 index 00000000..57e7d3df --- /dev/null +++ b/data/GrabberConfig/tagesspiegel.de.txt @@ -0,0 +1,60 @@ +# Author: zinnober +# Should work with "normal" articles as well as with image galleries + +prune: no + +# Title +title: //h1/span[@class='hcf-headline'] + +# Set author +author: //a[@rel='author'] + +# Set date +date: //span[@class='date hcf-atlas'] + +# Fetch full multipage articles +next_page_link: //a[contains(@class, 'hcf-forward')] + +# Content is here +body: //article +body: //div[contains(@class, 'hcf-screen')] + +# Remove tracking and ads +strip_id_or_class: hcf-ad +strip_id_or_class: hcf-autoload-ad +strip_id_or_class: hcf-content-ad + +# Tidy up before article +strip: //article/h1 +strip_id_or_class: hcf-atlas +strip_id_or_class: hcf-author +strip_id_or_class: date hcf-atlas +strip_id_or_class: date hcf-atlas + +# General cleanup +strip: //div[contains(@class, 'hcf-screen')]//h1 +strip: //div[@class='hcf-subpage-titles']//ul +strip_id_or_class: hcf-doctype-media +strip_id_or_class: hcf-inline-gallery +strip_id_or_class: hcf-doctype-video +strip_id_or_class: hcf-links +strip_id_or_class: hcf-mini-navi +strip_id_or_class: hcf-media-control +strip_id_or_class: hcf-hidden +replace_string(<span class="hcf-update">Update</span>): <strong>Update: </strong> + +# Fix pictures and captions +replace_string(<a class="hcf-doctype-gallery): <p class="hcf-doctype-gallery +replace_string(<a class="hcf-doctype-enlarge): <p class="hcf-doctype-enlarge +replace_string(<figcaption class="hcf-caption">): <br><small><em> +replace_string(</figcaption>): </em></small> 
+ +# Fix image galleries +replace_string(<a class=" ajaxify): <p class="ajaxify +replace_string(<div class="hcf-caption"><div><p>): <small><em> + +# Try it yourself +test_url: http://www.tagesspiegel.de/berlin/bezirke/wedding/wedding-jetzt/auf-der-suche-nach-einem-stadtteil-wilder-weiter-wedding/8757156.html +test_url: http://www.tagesspiegel.de/berlin/olympia-in-berlin-der-flughafen-tegel-soll-das-olympische-dorf-werden/10645036.html +test_url: http://www.tagesspiegel.de/mediacenter/fotostrecken/berlin/bildergalerie-kreuzberger-der-woche/9305534.html + diff --git a/data/GrabberConfig/takt-magazin.de.txt b/data/GrabberConfig/takt-magazin.de.txt new file mode 100644 index 00000000..62bcb5fa --- /dev/null +++ b/data/GrabberConfig/takt-magazin.de.txt @@ -0,0 +1,9 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.takt-magazin.de%2Fmusik%2Fradio-havanna_158176 +body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')] + +strip_id_or_class: post-pagination +strip_id_or_class: post-related + +test_url: https://www.takt-magazin.de/musik/radio-havanna_158176 diff --git a/data/GrabberConfig/tasteofhome.com.txt b/data/GrabberConfig/tasteofhome.com.txt new file mode 100644 index 00000000..f3234f34 --- /dev/null +++ b/data/GrabberConfig/tasteofhome.com.txt @@ -0,0 +1,11 @@ +title: //div[@id='ctl00_MainContent_ctl00_Div1']//h2 +body: //div[@id='ctl00_MainContent_ctl00_Div1'] + +single_page_link: //div[contains(@class, 'recipeHeader')]//a[contains(@href, '/print')] + +strip_image_src: tohPrintL.png + +prune: no + +test_url: http://www.tasteofhome.com/recipes/Grinch-Punch +test_url: http://www.tasteofhome.com/recipes/lactose-free-chocolate-chip-cookies
\ No newline at end of file diff --git a/data/GrabberConfig/taz.de.txt b/data/GrabberConfig/taz.de.txt new file mode 100644 index 00000000..b0d4d26f --- /dev/null +++ b/data/GrabberConfig/taz.de.txt @@ -0,0 +1,9 @@ +date: //li[@class='date'] +body: (//article[@class='sectbody'])[1] +title: concat(//article[@class='sectbody']/h4,': ',//article[@class='sectbody']/h1) +author: //a[@class='author']/h4 +strip: //p[@class='caption'] +strip_id_or_class: ad_bin +strip_id_or_class: rack + +test_url: https://www.taz.de/!5504959/ diff --git a/data/GrabberConfig/tbray.org.txt b/data/GrabberConfig/tbray.org.txt new file mode 100644 index 00000000..558dc9c8 --- /dev/null +++ b/data/GrabberConfig/tbray.org.txt @@ -0,0 +1,5 @@ +body: //div[@id='centercontent'] +strip: //div[@id='rightcontent'] +date: substring-before( //div[@id='cats'], '·') +title: //h1 +test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money
\ No newline at end of file diff --git a/data/GrabberConfig/teamliquid.net.txt b/data/GrabberConfig/teamliquid.net.txt new file mode 100644 index 00000000..7cfea150 --- /dev/null +++ b/data/GrabberConfig/teamliquid.net.txt @@ -0,0 +1,20 @@ +# Author: Jan Lukas Gernert + +tidy: no +prune: no + +title: //div[@class='Newsheader']/h1 +#date: //time +#author: //a[@rel='author'] + +body: //div[@id='Newswrap'] +body: //div[@class='text'] +body: //article + +strip: //div[@id='Newscontents'] +strip: //div[@class='Newsbanner'] +strip: //style + + + +test_url: http://www.teamliquid.net/forum/starcraft-2/484380-code-s-ro32-group-c-on-the-way-back-s2-2015 diff --git a/data/GrabberConfig/tech.fortune.cnn.com.txt b/data/GrabberConfig/tech.fortune.cnn.com.txt new file mode 100644 index 00000000..da198622 --- /dev/null +++ b/data/GrabberConfig/tech.fortune.cnn.com.txt @@ -0,0 +1,4 @@ +title: //h1[@class='storyheadline'] +body: //div[@class='storytext'] +strip: //strong +test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29
\ No newline at end of file diff --git a/data/GrabberConfig/tech.sina.com.cn.txt b/data/GrabberConfig/tech.sina.com.cn.txt new file mode 100644 index 00000000..75126f9c --- /dev/null +++ b/data/GrabberConfig/tech.sina.com.cn.txt @@ -0,0 +1,11 @@ +title://h1[contains(@id,'artibodyTitle')] + +date://span[contains(@id,'pub_date')] + +body://div[contains(@id,'artibody')] + +strip://div[contains(@class,'otherContent')] + +next_page_link://p[@class='page']/a[contains(.,'下一页')] + +test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/techcrunch.com.txt b/data/GrabberConfig/techcrunch.com.txt new file mode 100644 index 00000000..177b3f2c --- /dev/null +++ b/data/GrabberConfig/techcrunch.com.txt @@ -0,0 +1,19 @@ +body: //div[@class='article-entry text'] + +author: //div[@class='byline']/a + +date: //div[@class="byline"]/time/@datetime + +title: //h1[@class="tweet-title"] +strip_id_or_class: module-crunchbase +strip_id_or_class: aside aside-related-articles + +# The following is for the mobile site +body: //div[@id="singlentry"] +author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ') +date: substring-before(//div[@class="single-post-meta-top"],' @') +title: //a[@class="sh2"] + +prune: no + +test_url: http://techcrunch.com/2016/02/02/spotcap/?ncid=rss&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29 diff --git a/data/GrabberConfig/techdirt.com.txt b/data/GrabberConfig/techdirt.com.txt new file mode 100644 index 00000000..7db2f95b --- /dev/null +++ b/data/GrabberConfig/techdirt.com.txt @@ -0,0 +1,12 @@ +body: //div[@class='story'] +title: //div[@class='story']/h1 +strip: //div[@class='story']/h1 + +author: //div[@class='details']/p[contains(., 'by ')]/a +date: //p[@class='storydate'] + +strip: //p[a[contains(., 'Leave a Comment')]] +strip_id_or_class: share +strip_id_or_class: maincolumn_head +strip_id_or_class: maincolmod +test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/techhive.com.txt b/data/GrabberConfig/techhive.com.txt new file mode 100644 index 00000000..48ecf300 --- /dev/null +++ b/data/GrabberConfig/techhive.com.txt @@ -0,0 +1,18 @@ +title: //div[@class='articleHead']//h1 +author: //div[@class="author-name"]/a[1] +body: //div[@class="main"] + +# remove 'From the Lab' and 'Recent posts' text +strip: //div[@class='blogLabel'] + +# remove byline and meta info +strip: //div[@class="article-meta"] +strip: //div[@class="author-info"] + +#strip tags and categories +strip: //div[@class="department"] + +#strip product cap links +strip: //div[@class="cap-main"] +strip: //div[@id="compare-lede"] +test_url: http://www.techhive.com/article/3023383/streaming-media/vlc-for-apple-tv-review-goodbye-format-woes.html diff --git a/data/GrabberConfig/techmeme.com.txt b/data/GrabberConfig/techmeme.com.txt new file mode 100644 index 00000000..26eb37b0 --- /dev/null +++ b/data/GrabberConfig/techmeme.com.txt @@ -0,0 +1,3 @@ +single_page_link_in_feed: //b/a + +test_url: http://www.techmeme.com/feed.xml diff --git a/data/GrabberConfig/techno-science.net.txt b/data/GrabberConfig/techno-science.net.txt new file mode 100644 index 00000000..31dd7f4e --- /dev/null +++ b/data/GrabberConfig/techno-science.net.txt @@ -0,0 +1,3 @@ +title://div[@class="news"]/div[@class="titre"] +body://div[@class="news"]/div[@class="texte"] +test_url: http://www.techno-science.net/?onglet=news&news=14808 diff --git a/data/GrabberConfig/technologizer.com.txt b/data/GrabberConfig/technologizer.com.txt new file mode 100644 index 00000000..179bf5a6 --- /dev/null +++ b/data/GrabberConfig/technologizer.com.txt @@ -0,0 +1,5 @@ +next_page_link: //a[contains(., 'NEXT PAGE')] +# following::node() selects text nodes too whereas following::* selects only elements. +strip: //span[@class='pageo']/following::node() +strip: //span[@class='pageo'] +test_url: http://technologizer.com/2010/03/08/the-secret-origin-of-windows/
\ No newline at end of file diff --git a/data/GrabberConfig/technologyreview.com.txt b/data/GrabberConfig/technologyreview.com.txt new file mode 100644 index 00000000..0eecf441 --- /dev/null +++ b/data/GrabberConfig/technologyreview.com.txt @@ -0,0 +1,20 @@ +title: //header[@class='article-topper__title'] + +body: //section[contains(@class, 'body')] + +# Author & Date for News and Featured Stories +author: //ul[@class='byline']/li/a +author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on') +date: substring-after(//ul[@class='byline']/li, 'on ') + +# Author & Date for "Views" +author: //div[@class='view-byline']/div[@class='meta']/h2[1] +date: //div[@class='view-byline']/div[@class='meta']/h2[2] + +strip_id_or_class: l-article-list +strip_id_or_class: l-automated-related--single +strip_id_or_class: l-cta--left +strip_id_or_class: l-automated-trending--ordered + +next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')] +test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/ diff --git a/data/GrabberConfig/techpinions.com.txt b/data/GrabberConfig/techpinions.com.txt new file mode 100644 index 00000000..8e1aa96c --- /dev/null +++ b/data/GrabberConfig/techpinions.com.txt @@ -0,0 +1,7 @@ +body: //div[@class="post"] + +strip: //div[@class="post-meta"] +strip: //div[@id="socialicons"] +strip: //div[@id="authorbox"] + +test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572
\ No newline at end of file diff --git a/data/GrabberConfig/techradar.com.txt b/data/GrabberConfig/techradar.com.txt new file mode 100644 index 00000000..0a0ca619 --- /dev/null +++ b/data/GrabberConfig/techradar.com.txt @@ -0,0 +1,12 @@ +# Title without news/reviews etc. appended +title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1 + +# Remove home link +strip: //div[@id='page_logo']/a + +# Remove utilities +strip: //*[(@id = "utilities")] + +# Remove comments link +strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny'] +test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105
\ No newline at end of file diff --git a/data/GrabberConfig/techstage.de.txt b/data/GrabberConfig/techstage.de.txt new file mode 100644 index 00000000..1331f780 --- /dev/null +++ b/data/GrabberConfig/techstage.de.txt @@ -0,0 +1,9 @@ +strip_id_or_class: teaser +strip_id_or_class: a-pvg +strip_id_or_class: a-pvg-deal__price +strip_id_or_class: a-pvg__logo +strip_id_or_class: a-grid +strip_id_or_class: comments_container +strip: //a-tabs + +test_url: http://techstage.de/-4156945 diff --git a/data/GrabberConfig/ted.com.txt b/data/GrabberConfig/ted.com.txt new file mode 100644 index 00000000..fe71e9ea --- /dev/null +++ b/data/GrabberConfig/ted.com.txt @@ -0,0 +1,11 @@ +title: //title +body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image'] + +strip_id_or_class: talk-transcript__para__time + +single_page_link: //a[@id='hero-transcript-link'] + +#prune: no +tidy: no + +test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are diff --git a/data/GrabberConfig/telegraaf.nl.txt b/data/GrabberConfig/telegraaf.nl.txt new file mode 100644 index 00000000..91b5baf9 --- /dev/null +++ b/data/GrabberConfig/telegraaf.nl.txt @@ -0,0 +1,9 @@ +body: //div[@id='artikelKolom'] +strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper'] +strip: //div[@id='artikeltoolbar'] +strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer'] +strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget'] +tidy: no +prune: no + +test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss
\ No newline at end of file diff --git a/data/GrabberConfig/telegraph.co.uk.txt b/data/GrabberConfig/telegraph.co.uk.txt new file mode 100644 index 00000000..8dcdb42b --- /dev/null +++ b/data/GrabberConfig/telegraph.co.uk.txt @@ -0,0 +1,10 @@ +body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea'] +strip: //p[@class='comments'] +strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")] +strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links'] +strip: //p[@class='bbpTweet']/span[@class='timestamp'] +strip: //p[@class='bbpTweet']/span[@class='metadata']//img +tidy: no +prune: no + +test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html
\ No newline at end of file diff --git a/data/GrabberConfig/telerama.fr.txt b/data/GrabberConfig/telerama.fr.txt new file mode 100644 index 00000000..5eceed93 --- /dev/null +++ b/data/GrabberConfig/telerama.fr.txt @@ -0,0 +1,5 @@ +author: //div[@itemprop="author"]//img/@alt +strip: //div[@id="article--sidebar-right"] +strip: //div[@class="article--tags"] + +test_url: https://www.telerama.fr/medias/peertube,-la-plateforme-qui-defie-youtube-avec-lethique,n5715829.php diff --git a/data/GrabberConfig/temoignagechretien.fr.txt b/data/GrabberConfig/temoignagechretien.fr.txt new file mode 100644 index 00000000..fad4590a --- /dev/null +++ b/data/GrabberConfig/temoignagechretien.fr.txt @@ -0,0 +1,16 @@ + +title: //meta[@property='og:title']/@content +author: //span[contains(concat(' ',normalize-space(@class),' '),' auteur ')] +body: //article[contains(concat(' ',normalize-space(@class),' '),' node-article ')] + +strip_id_or_class: comments +strip_id_or_class: comment-add +strip_id_or_class: field-tags +strip_id_or_class: field-genres +strip_id_or_class: field-source +strip_id_or_class: field-categories +strip_id_or_class: auteur-date-share +strip_id_or_class: node-produit +strip: //article[contains(concat(' ',normalize-space(@class),' '),' node-article ')]/h1[1] + +test_url: https://temoignagechretien.fr/articles/culture/les-mondes-de-youssou-ndour diff --git a/data/GrabberConfig/the-magazine.org.txt b/data/GrabberConfig/the-magazine.org.txt new file mode 100644 index 00000000..08864657 --- /dev/null +++ b/data/GrabberConfig/the-magazine.org.txt @@ -0,0 +1,3 @@ +tidy: no + +test_url: http://the-magazine.org/1/alone-together-again
\ No newline at end of file diff --git a/data/GrabberConfig/the-scientist.com.txt b/data/GrabberConfig/the-scientist.com.txt new file mode 100644 index 00000000..fec5e893 --- /dev/null +++ b/data/GrabberConfig/the-scientist.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.the-scientist.com%2F%3Farticles.view%2FarticleNo%2F51260%2Ftitle%2FScientists-Continue-to-Use-Outdated-Methods%2F + +body: //div[@id='siteInnerContent'] +test_url: https://www.the-scientist.com/?articles.view/articleNo/51260/title/Scientists-Continue-to-Use-Outdated-Methods/
\ No newline at end of file diff --git a/data/GrabberConfig/theage.com.au.txt b/data/GrabberConfig/theage.com.au.txt new file mode 100644 index 00000000..ea27c314 --- /dev/null +++ b/data/GrabberConfig/theage.com.au.txt @@ -0,0 +1,5 @@ +author: //h3[@class='authorName'] +date: //time +body: //div[@class='articleBody'] +strip_id_or_class: adspot +test_url: http://www.theage.com.au/victoria/top-cops-warns-outlaw-bikies-we-have-a-gang-too-20130331-2h1l8.html
\ No newline at end of file diff --git a/data/GrabberConfig/theamericanscholar.org.txt b/data/GrabberConfig/theamericanscholar.org.txt new file mode 100644 index 00000000..38b96672 --- /dev/null +++ b/data/GrabberConfig/theamericanscholar.org.txt @@ -0,0 +1,13 @@ +# Article Metadata +title: //meta[@property="og:title"]/@content +author: substring-after(//h3, 'By ') +date: //h4/a[2] + +# Content Pruning +strip: //h4 +strip: //a[@id="print_button"] +strip: //p[@class="excerpt"] +strip: //h3 +strip: //div[@class="caption"] +strip: //center/a/img +test_url: http://theamericanscholar.org/too-big-to-fail-and-too-risky-to-exist/
\ No newline at end of file diff --git a/data/GrabberConfig/theatlantic.com.txt b/data/GrabberConfig/theatlantic.com.txt new file mode 100644 index 00000000..8cf1920a --- /dev/null +++ b/data/GrabberConfig/theatlantic.com.txt @@ -0,0 +1,34 @@ +title: //meta[@property='og:title']/@content + +#title: //div[contains(@class, 'articleHead')]//h1 + +# single_page_link: //link[contains(@href, '/amp/article/')] + +body: //div[@id='main-article'] + +body: //div[@itemprop='articleBody'] +body: //div[@class='articleText'] +body: //div[@class='articleContent'] +body: //div[@id='article'] +date: //*[contains(@class, 'date')] +author: //div[@id='profile']//*[@class='authors']//a[1] +author: //*[@class='author']/span +prune: no + +strip: //div[@class='moreOnBoxWithImages'] +strip: //p[contains(., 'This article available online at:')] +strip: //p[contains(., 'This article available online at:')]/following::* +strip: //div[@class='earthbox'] + +#single_page_link: //div[contains(@class, 'article-tools')]//a[contains(@class, 'print')] + +native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')] + +#multi-page article (not multi-page anymore) +test_url: http://www.theatlantic.com/magazine/archive/2014/12/the-real-roots-of-midlife-crisis/382235/ +test_contains: The curve tends to evince itself + +test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/ +test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/ +test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ +test_url: http://www.theatlantic.com/technology/archive/2013/06/fix-things-never-force-it-lessons-from-grandpa/276873/ diff --git a/data/GrabberConfig/theatlanticcities.com.txt b/data/GrabberConfig/theatlanticcities.com.txt new file mode 100644 index 00000000..880f207d --- /dev/null +++ 
b/data/GrabberConfig/theatlanticcities.com.txt @@ -0,0 +1,17 @@ +# To administrator: +# Please replace the hostname with "*.theatlanticcities.com" + +# This filter is tested on: +# http://m.theatlanticcities.com/arts-and-lifestyle/2012/04/invisible-borders-define-american-culture/1839/ +# http://www.theatlanticcities.com/housing/2012/11/chinas-holdouts/3981/ +# http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/ + +title://h1 +author: //ul[@class='meta']/li/a +date: //ul[@class='meta']/li/following-sibling::li +body://article[@class='post'] + +strip://h1 +strip://ul[@class='meta'] +strip://div[@class='newsletter-slug'] +test_url: http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/
\ No newline at end of file diff --git a/data/GrabberConfig/thecut.com.txt b/data/GrabberConfig/thecut.com.txt new file mode 100644 index 00000000..017484cb --- /dev/null +++ b/data/GrabberConfig/thecut.com.txt @@ -0,0 +1,18 @@ +#copied from nymag.com.txt + +title: //h2[contains(@class, 'primary')] +body: //*[@itemprop="articleBody"] +body: //div[@id='story'] +author: //*[@class='by']/a +date: substring-after(//*[@class='date'], 'Published') + +#Skip GDPR warning +http_header(Cookie): nymuc=11111111111 + +parser: html5php +tidy: no + +next_page_link: //div[@class='page-navigation']//li[@class='next']/a + +test_url: https://www.thecut.com/2018/06/trump-administration-says-it-has-a-family-reunification-plan.html +test_contains: back to their home country diff --git a/data/GrabberConfig/thedailybeast.com.txt b/data/GrabberConfig/thedailybeast.com.txt new file mode 100644 index 00000000..a31d0af9 --- /dev/null +++ b/data/GrabberConfig/thedailybeast.com.txt @@ -0,0 +1,10 @@ +body: //div[contains(@class, 'ArticleBody')] +strip_id_or_class: share +strip_id_or_class: Share +strip_id_or_class: footer +strip_id_or_class: Footer +strip_id_or_class: Newsletter +prune: no +test_url: http://www.thedailybeast.com/articles/2017/04/01/michael-flynn-failed-to-disclose-payments-from-russian-propaganda-network.html +test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html +test_contains: people who are going through a divorce are more likely to miss payments diff --git a/data/GrabberConfig/thedailymash.co.uk.txt b/data/GrabberConfig/thedailymash.co.uk.txt new file mode 100644 index 00000000..a83a6cf6 --- /dev/null +++ b/data/GrabberConfig/thedailymash.co.uk.txt @@ -0,0 +1,14 @@ +# Remove duplicated title +strip: //div[@id='content']/div[1][@class='full_intro']/h2 + +# Remove links, ads etc. 
+strip: //*[(@class= "aside")] + +# Remove the date and add it to the date published field in Instapaper +strip: //div[@class="date"] +date: //div[@class="date"] + +# There is no byline on The Daily Mash. + +convert_double_br_tags: yes +test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29
\ No newline at end of file diff --git a/data/GrabberConfig/thedisneyblog.com.txt b/data/GrabberConfig/thedisneyblog.com.txt new file mode 100644 index 00000000..57b3254a --- /dev/null +++ b/data/GrabberConfig/thedisneyblog.com.txt @@ -0,0 +1,7 @@ +title: //h1[contains(@class, 'entry-title')] +author: //span[contains(@class, 'author vcard')] +date: //span[@class = 'entry-date'] +body: //div[@class='entry-content'] +strip_id_or_class: bottomcontainerBox +strip_id_or_class: lightsocial_container +test_url: http://thedisneyblog.com/2012/11/17/videopolis-one-woman-disney-musical-beauty-and-the-beast/
\ No newline at end of file diff --git a/data/GrabberConfig/thedrive.com.txt b/data/GrabberConfig/thedrive.com.txt new file mode 100644 index 00000000..ecb9d115 --- /dev/null +++ b/data/GrabberConfig/thedrive.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='has-ad-column'] + +strip: //div[contains(@class, 'articleFragment') and not(contains(@class, 'paragraph'))] + +test_url: http://www.thedrive.com/new-cars/12579/why-this-4000-renault-is-as-disruptive-as-the-tesla-model-3 diff --git a/data/GrabberConfig/thefilmexperience.net.txt b/data/GrabberConfig/thefilmexperience.net.txt new file mode 100644 index 00000000..e6b5115a --- /dev/null +++ b/data/GrabberConfig/thefilmexperience.net.txt @@ -0,0 +1,2 @@ +body: //div[@class='body'] +test_url: http://thefilmexperience.net/blog/2011/12/30/distant-relatives-2001-a-space-odyssey-and-the-tree-of-life.html
\ No newline at end of file diff --git a/data/GrabberConfig/thegamedesignforum.com.txt b/data/GrabberConfig/thegamedesignforum.com.txt new file mode 100644 index 00000000..849ede77 --- /dev/null +++ b/data/GrabberConfig/thegamedesignforum.com.txt @@ -0,0 +1,14 @@ +## ERROR: Removes all images. Please fix, have no idea why (bad HTML?) + +title: //h1[@class='featuretitle'] +body: //div[@id='nobordercontentarea'] + +# remove Twitter badge +strip: //img[@alt='Follow tgdfweb on Twitter'] + +# fix for headers not showing for some reason +wrap_in(h2): //h2[@class='sectionheader'] +dissolve: //h2[@class='sectionheader'] + +tidy: yes +test_url: http://thegamedesignforum.com/features/acceleration_flow_1.html
\ No newline at end of file diff --git a/data/GrabberConfig/thegap.at.txt b/data/GrabberConfig/thegap.at.txt new file mode 100644 index 00000000..5bf8ce50 --- /dev/null +++ b/data/GrabberConfig/thegap.at.txt @@ -0,0 +1,41 @@ +prune: no +tidy: no + +title: //h1 +strip: //h1 + +date: //span[contains(concat(' ', normalize-space(@class), ' '), ' date ')] +strip: //span[contains(concat(' ', normalize-space(@class), ' '), ' date ')] + +# Article + +author: normalize-space(substring-after(//div[@class='artikel']/p[1], 'von ')) +strip: //div[@class='artikel']/p[1] + +next_page_link: //a[@class='but weiter']/@href +strip: //div[@class='browsetext'] + +body: //div[@class='artikel'] +strip: //h5 +strip: //div[@class='copyrights'] +strip: //div[@class='textbox'] +strip: (//div[@class='artikel']//br)[1] +strip: //div[@class='clear'] +strip: //p[starts-with(., 'Weiter zu:')] +strip: //a[@name='minislide'] +strip: //div[@class='kommentare'] + +# Slideshows + +author: normalize-space(substring-after(//div[@id='normal']/p[1], 'von ')) +strip: //div[@id='normal']/p[1] + +body: //div[@id='normal'] +next_page_link: //a[@class='next']/@href +strip: //a[@class='next'] +strip: //a[@class='prev'] +strip: (//div[@id='normal']//br)[1] + +test_url: http://www.thegap.at/rubriken/stories/artikel/lecko-mio/ +test_url: http://www.thegap.at/rubriken/stories/artikel/die-frauen-im-arkadenhof/ +test_url: http://www.thegap.at/rubriken/stories/artikel/nothilfe-im-wandel/ diff --git a/data/GrabberConfig/theglobalmail.org.txt b/data/GrabberConfig/theglobalmail.org.txt new file mode 100644 index 00000000..da1c84f9 --- /dev/null +++ b/data/GrabberConfig/theglobalmail.org.txt @@ -0,0 +1,41 @@ +title: //h1[@id="headline"] +author: //div[contains(@class, "editorial-byline-author")]/a +date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") + +# The article body contains a mix or article and non-article elements, so lot of manual tweaks are needed +body: //div[@id="template"] 
+strip_id_or_class: editorial-byline-pic +strip_id_or_class: editorial-byline +strip_id_or_class: headline + +# Include the leadin paragraph in the body text, but remove quotes because they're out of context +dissolve: //div[contains(@id, "leadin")] +strip_id_or_class: pullquote + +# Image captions removed because they're confusing in body text +strip_id_or_class: image-caption-content + +# Remove header and footer +strip_id_or_class: header +strip_id_or_class: footer + +# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image +strip: /html/body/span[contains(@style, "display: none")] + +# Remove search box +strip_id_or_class: searchContainer +strip: //div[contains(@class, "searchInstruction")] +strip: //div[contains(@class, "searchResults")]/h4 + +# Remove the 'Letters to the Editor' section +strip_id_or_class: letter-text +strip_id_or_class: letter-from +strip_id_or_class: letter-date + +# Remove Like/Tweet links +strip_id_or_class: social-tab + +# Remove 'divider' which causes an inexplicable slash to appear in the article body +strip_id_or_class: divider + +test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/
\ No newline at end of file diff --git a/data/GrabberConfig/theglobeandmail.com.txt b/data/GrabberConfig/theglobeandmail.com.txt new file mode 100644 index 00000000..2473cad2 --- /dev/null +++ b/data/GrabberConfig/theglobeandmail.com.txt @@ -0,0 +1,10 @@ +body: //div[contains(@class, 'entry-content')]//div[contains(@class, 'column-2')] +single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')] +strip_id_or_class: entry-related +strip_id_or_class: entry-sidebar +strip_id_or_class: entry-pagination +tidy: no +prune: no + +test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/ +test_url: http://www.theglobeandmail.com/report-on-business/industry-news/energy-and-resources/cliffs-natural-resources-looking-to-exit-ontarios-ring-of-fire/article20651617/
\ No newline at end of file diff --git a/data/GrabberConfig/thegreatdiscontent.com.txt b/data/GrabberConfig/thegreatdiscontent.com.txt new file mode 100644 index 00000000..12442b40 --- /dev/null +++ b/data/GrabberConfig/thegreatdiscontent.com.txt @@ -0,0 +1,6 @@ +title: //h1[@id='headline'] +author: substring-after(//section[@class="credits"]/ul/li[1],"Interview by ") +date: //time[@pubdate] +body: //article[@class='interview'] +strip: //article[@class='interview']/footer +test_url: http://thegreatdiscontent.com/jeffrey-zeldman
\ No newline at end of file diff --git a/data/GrabberConfig/theguardian.com.txt b/data/GrabberConfig/theguardian.com.txt new file mode 100644 index 00000000..5adbfa15 --- /dev/null +++ b/data/GrabberConfig/theguardian.com.txt @@ -0,0 +1,51 @@ +body: //article +body: //div[contains(concat(' ',normalize-space(@class),' '),' content__main ')]//div[contains(concat(' ',normalize-space(@class),' '),' gs-container ')] +strip: //article//div[contains(@class, 'content__secondary-column')] +strip: //article//aside +strip: //article//div[contains(@class, 'block-share')] +strip: //article//div[@class='submeta'] +strip: //article//span[contains(@class, 'inline-expand-image')] +strip: //article//div[@class='kindleWidget'] +strip: //article//div[@class='email-subscription'] +strip: //article//script +strip: //article//figure[contains(@class, 'element-audio')] +strip: //article//a[contains(@style, 'display: none')] +strip: //article/div[@class='paidfor-band'] +strip: //header +strip: //div[@class='content__main-column'] +strip: //div[contains(@class, 'content-footer')] +strip: //footer + +strip_id_or_class: hide-on-mobile +strip_id_or_class: reveal-caption +strip_id_or_class: readerquestions +strip_id_or_class: submeta + +strip: //figcaption + + +author: //article//p[@class='byline'] +date: //article//time/@datetime +strip: //article//div[contains(@class, 'content__meta-container')] + +native_ad_clue: //meta[@property='article:tag' and contains(@content, 'partner zone')] +native_ad_clue: //meta[@property='video:tag' and contains(@content, 'partner zone')] + +prune: no +tidy: no + +test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption +test_contains: The National Security Agency has made repeated attempts to develop +test_contains: The agency did not directly address those questions, instead providing a statement. 
+ +test_url: http://www.theguardian.com/world/2013/oct/03/edward-snowden-files-john-lanchester +test_contains: In August, the editor of the Guardian rang me up and asked if I would spend a week in New York +test_contains: As the second most senior judge in the country, Lord Hoffmann, said in 2004 about a previous version of our anti-terrorism laws + +test_url: http://www.theguardian.com/commentisfree/2014/jun/15/britishness-search-identity-my-part-in-camerons-odyssey +test_url: http://www.theguardian.com/world/2016/feb/17/ankara-explosion-turkey-injures-large-number-of-people-reports-say +test_url: http://www.theguardian.com/uk-news/2016/feb/11/trident-the-british-question +test_url: https://www.theguardian.com/books/live/2016/oct/13/nobel-prize-in-literature-2016-liveblog + +# Native ad +test_url: http://www.theguardian.com/sustainable-business/fairtrade-partner-zone/chocolate-cocoa-production-risk diff --git a/data/GrabberConfig/theindychannel.com.txt b/data/GrabberConfig/theindychannel.com.txt new file mode 100644 index 00000000..2cd865bb --- /dev/null +++ b/data/GrabberConfig/theindychannel.com.txt @@ -0,0 +1,13 @@ +title: //h1[@class="Headline"] +date: substring-after(//div[@class="posted"], 'EDT ') +body: //div[@class="storyBody"] + +strip: //td[@class="AssocContentTD"] +strip: //div[@id="pageTitle"] +strip: //div[@class="posted"] +strip: //div[@class="updated"] +strip: //div[@class="js-kit-disclaimer"] +strip: //table[@class="row3table"] +strip: //div[@class="container2"] +strip: //div[@id="delta"] +test_url: http://www.theindychannel.com/news/31050840/detail.html
\ No newline at end of file diff --git a/data/GrabberConfig/theintercept.com.txt b/data/GrabberConfig/theintercept.com.txt new file mode 100644 index 00000000..51f87d43 --- /dev/null +++ b/data/GrabberConfig/theintercept.com.txt @@ -0,0 +1,4 @@ +title: //h1[@class="Headline"] +body: //div[@class="PostContent"] + +test_url: https://theintercept.com/2014/10/30/inside-story-matt-taibbis-departure-first-look-media/ diff --git a/data/GrabberConfig/themarker.com.txt b/data/GrabberConfig/themarker.com.txt new file mode 100644 index 00000000..141b1a3b --- /dev/null +++ b/data/GrabberConfig/themarker.com.txt @@ -0,0 +1,11 @@ +title: //h1[contains(@class, 'mainTitle')] +author: //ul[@class='author']//a[@rel='author'] +body: //div[@id='article-box'] +prune: no +tidy: no +strip_id_or_class: head +strip_id_or_class: social-nav +strip_id_or_class: rate +strip_id_or_class: video + +test_url: http://www.themarker.com/markerweek/1.2093167
\ No newline at end of file diff --git a/data/GrabberConfig/themillions.com.txt b/data/GrabberConfig/themillions.com.txt new file mode 100644 index 00000000..5039ad0c --- /dev/null +++ b/data/GrabberConfig/themillions.com.txt @@ -0,0 +1,10 @@ +title: //h1[@class='entry-title'] + +body: //div[@class='the-content'] + +strip: //div[@class='author-description'] +strip: //div[@class='the-content']/p/small +strip_id_or_class: support_millions_single + +test_url: https://themillions.com/2018/01/regarding-the-em-dash.html +test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html
\ No newline at end of file diff --git a/data/GrabberConfig/thenation.com.txt b/data/GrabberConfig/thenation.com.txt new file mode 100644 index 00000000..dab17f0b --- /dev/null +++ b/data/GrabberConfig/thenation.com.txt @@ -0,0 +1,13 @@ +title: //h2[@property='dc:title'] +#body: //div[@class='print-content'] +body: //div[@id='wysiwyg'] +author: //a[contains(@href, '/authors')] +author: substring-before(//div[@class='print-created'], '|') +date: //span[@class='article-date'] +date: substring-after(//div[@class='print-created'], '|') +prune: no + +#single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')] +single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '?page=full')] + +test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher
\ No newline at end of file diff --git a/data/GrabberConfig/thenetworkgarden.blogs.com.txt b/data/GrabberConfig/thenetworkgarden.blogs.com.txt new file mode 100644 index 00000000..b7f5f0f0 --- /dev/null +++ b/data/GrabberConfig/thenetworkgarden.blogs.com.txt @@ -0,0 +1,4 @@ +body: //div[@id="beta-inner"] +title: //h3[@class="entry-header"] + +test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html
\ No newline at end of file diff --git a/data/GrabberConfig/thenews.coop.txt b/data/GrabberConfig/thenews.coop.txt new file mode 100644 index 00000000..2d3e95ed --- /dev/null +++ b/data/GrabberConfig/thenews.coop.txt @@ -0,0 +1,3 @@ +body: //div[@itemprop='articleBody'] + +test_url: http://www.thenews.coop/98221/news/co-operatives/jeremy-corbyn-needs-co-op-movement/ diff --git a/data/GrabberConfig/thenextgeneration.org.txt b/data/GrabberConfig/thenextgeneration.org.txt new file mode 100644 index 00000000..dedd989f --- /dev/null +++ b/data/GrabberConfig/thenextgeneration.org.txt @@ -0,0 +1,8 @@ +title: //h1[@class='interior-page-title'] +author: //span[@class='author']/a +date: //div[@class='byline']/time +body: //div[@class='rich-text-body'] + +strip: //div[@class='byline'] +strip: //div[@class='offscreen-menu'] +test_url: http://thenextgeneration.org/blog/post/rebrand-announce/
\ No newline at end of file diff --git a/data/GrabberConfig/thenextweb.com.txt b/data/GrabberConfig/thenextweb.com.txt new file mode 100644 index 00000000..583b725b --- /dev/null +++ b/data/GrabberConfig/thenextweb.com.txt @@ -0,0 +1,16 @@ +title: //h1[@class='title'] +body: //div[@class='l-postSingle-content-main-video'] | //div[contains(concat(' ',normalize-space(@class),' '),' l-postSingle-content-main-content ')] +author: //a[@class='author'] +date: //div[@class='meta']/time/@datetime + +strip: //div[contains(concat(' ',normalize-space(@class),' '),' l-postSingle-content-bottom-share ')] +strip_id_or_class: articleShare-buttons +strip_id_or_class: ica-wrapper +strip_id_or_class: post-author-bio +strip_id_or_class: post-author-trivia +replace_string(data-original=): src= + +tidy: no + +test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/ +test_url: http://thenextweb.com/insider/2015/12/06/pornhubs-christmas-ad-wants-you-to-give-premium-subscriptions-to-your-grandparents/ diff --git a/data/GrabberConfig/theoaklandpress.com.txt b/data/GrabberConfig/theoaklandpress.com.txt new file mode 100644 index 00000000..c9abda71 --- /dev/null +++ b/data/GrabberConfig/theoaklandpress.com.txt @@ -0,0 +1,3 @@ +body: //div[@id='fullstory'] +strip: //div[@id='page_leftbar'] +test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt
\ No newline at end of file diff --git a/data/GrabberConfig/theonion.com.txt b/data/GrabberConfig/theonion.com.txt new file mode 100644 index 00000000..90e8d658 --- /dev/null +++ b/data/GrabberConfig/theonion.com.txt @@ -0,0 +1,11 @@ +title: //h2[@class='title'] +date: substring-before(//p[@class='meta'], '|') +body: //div[@class='story'] +#body: //div[@class='article_body'] + +strip: //h2[@class='title'] +strip: //p[@class='meta'] +strip: //div[@class='ga_section'] +strip: //div[@id='recent_slider'] + +test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/
\ No newline at end of file diff --git a/data/GrabberConfig/theoutline.com.txt b/data/GrabberConfig/theoutline.com.txt new file mode 100644 index 00000000..79326de0 --- /dev/null +++ b/data/GrabberConfig/theoutline.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Ftheoutline.com%2Fpost%2F6408%2Fmaybe-its-time-a-star-is-born-campaign-song + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post__body ')] +test_url: https://theoutline.com/post/6408/maybe-its-time-a-star-is-born-campaign-song diff --git a/data/GrabberConfig/thepioneerwoman.com.txt b/data/GrabberConfig/thepioneerwoman.com.txt new file mode 100644 index 00000000..75583cd3 --- /dev/null +++ b/data/GrabberConfig/thepioneerwoman.com.txt @@ -0,0 +1,11 @@ +title: //h1[@class='post-title'] +body: //div[@class='post'] +author: //p[@class='posted-by'] +date: //div[@class='sprite post-date'] + +# The body of the post doesn't have it's own div so we have to strip out the metadata +strip: //div[@class='author_avatar'] +strip: //div[@class='sprite post-date'] +strip: //h1[@class='post-title'] +strip: //p[@class='posted-by'] +test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/
\ No newline at end of file diff --git a/data/GrabberConfig/thepointmag.com.txt b/data/GrabberConfig/thepointmag.com.txt new file mode 100644 index 00000000..2a822cde --- /dev/null +++ b/data/GrabberConfig/thepointmag.com.txt @@ -0,0 +1,5 @@ +body: //div[@class='article'] +strip_id_or_class: z-max +strip_id_or_class: readLaterMenu + +test_url: https://thepointmag.com/2016/examined-life/can-liberal-education-save-the-sciences diff --git a/data/GrabberConfig/theregister.co.uk.txt b/data/GrabberConfig/theregister.co.uk.txt new file mode 100644 index 00000000..70d3d437 --- /dev/null +++ b/data/GrabberConfig/theregister.co.uk.txt @@ -0,0 +1,9 @@ +single_page_link: //link[contains(@href, 'm.theregister')] +if_page_contains: //div[@id='nextpage'] +strip: //div[@class='wptl btm'] +body: //div[contains(@class,'article_head')]//h2 | //div[@id='body'] + +#multipage +test_url: http://www.theregister.co.uk/2015/07/06/geeks_guide_spaceguard_center/ +#singlepage +test_url: http://www.theregister.co.uk/2015/07/06/us_japan_massive_robots_in_the_ring/ diff --git a/data/GrabberConfig/therumpus.net.txt b/data/GrabberConfig/therumpus.net.txt new file mode 100644 index 00000000..84d0e783 --- /dev/null +++ b/data/GrabberConfig/therumpus.net.txt @@ -0,0 +1,4 @@ +title: /html/body/div/div[2]/div/div/h1 + +body: /html/body/div/div[2]/div/div/div[2] +test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes
\ No newline at end of file diff --git a/data/GrabberConfig/theses.enc.sorbonne.fr.txt b/data/GrabberConfig/theses.enc.sorbonne.fr.txt new file mode 100644 index 00000000..55a7f7d9 --- /dev/null +++ b/data/GrabberConfig/theses.enc.sorbonne.fr.txt @@ -0,0 +1,5 @@ +title: //h1[@class="head"] +author: //div[@class="name"] +body: //article[@id="text"] + +test_url: http://theses.enc.sorbonne.fr/2014/sidre diff --git a/data/GrabberConfig/thesimpledollar.com.txt b/data/GrabberConfig/thesimpledollar.com.txt new file mode 100644 index 00000000..dcdf2572 --- /dev/null +++ b/data/GrabberConfig/thesimpledollar.com.txt @@ -0,0 +1,4 @@ +title: //h3[@class='post-title']/a[@class='post-title-link'] +body: //div[@class='post-content'] +author: //div[@class='post-meta-under-title']/a +test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/
\ No newline at end of file diff --git a/data/GrabberConfig/thespoof.com.txt b/data/GrabberConfig/thespoof.com.txt new file mode 100644 index 00000000..f71cfb6b --- /dev/null +++ b/data/GrabberConfig/thespoof.com.txt @@ -0,0 +1,9 @@ +title: //h1[contains(@class, 'cTitle')] +body: //div[contains(@class, 'KonaBody') or @id='articleimageright'] +author: //meta[@name='Author']/@content +date: //meta[@name='OriginalPublicationDate']/@content + +prune: no +tidy: no + +test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389
\ No newline at end of file diff --git a/data/GrabberConfig/thestranger.com.txt b/data/GrabberConfig/thestranger.com.txt new file mode 100644 index 00000000..6fcf4fdf --- /dev/null +++ b/data/GrabberConfig/thestranger.com.txt @@ -0,0 +1,12 @@ +# savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029 + +# The other filtering is for plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885 + +title: //div[@id='savageColumn_head']/h1 +title: //h1[@class="headlineLarge"] + +strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner'] + +body: //div[@id='savageColumn'] +body: //div[@id='story_text'] +test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029
\ No newline at end of file diff --git a/data/GrabberConfig/thestreet.com.txt b/data/GrabberConfig/thestreet.com.txt new file mode 100644 index 00000000..58eabf00 --- /dev/null +++ b/data/GrabberConfig/thestreet.com.txt @@ -0,0 +1,25 @@ +title: //div[@id='storyHdr']/h1 +title: //div[@id='print']//h2 +body: //div[@class="virtualpage"] +body: //div[@id='print']//div[@id='bd'] +author: //meta[@name="AUTHOR"]/@content +author: (//div[@id='print']//div[@id='bd']/h4)[1] +date: //meta[@name="DATE"]/@content +date: //div[@id='print']//div[@id='dte'] + +strip_id_or_class: articleFooter +strip_id_or_class: sidebar +strip_id_or_class: ie6PrintSubhead +strip_id_or_class: subHdr + + +replace_string(<P/>): </p><p> + +prune: no + +#TODO: redirects back - perhaps needs referer to work +single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')] + +test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html +# multi page +test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html
\ No newline at end of file diff --git a/data/GrabberConfig/thethaovanhoa.vn.txt b/data/GrabberConfig/thethaovanhoa.vn.txt new file mode 100644 index 00000000..6b3277eb --- /dev/null +++ b/data/GrabberConfig/thethaovanhoa.vn.txt @@ -0,0 +1,2 @@ +strip:/html/body/form/div[2]/div[3]/div/div/div/div/div/div/div/div/div/div[2]/div[3]/div[2]/div/p[2] +test_url: http://thethaovanhoa.vn/151N20110519085606745T129/levante-quyet-giu-caicedo.htm
\ No newline at end of file diff --git a/data/GrabberConfig/theverge.com.txt b/data/GrabberConfig/theverge.com.txt new file mode 100644 index 00000000..86cb8c7e --- /dev/null +++ b/data/GrabberConfig/theverge.com.txt @@ -0,0 +1,50 @@ +author: //meta[@name="author"]/@content +title: //meta[@property="og:title"]/@content +date: //meta[@property="article:published_time"]/@content + +# //picture selector seems to cause problems with text extraction. +# body: //picture[contains(@class, 'c-picture')] | //div[contains(@class, 'c-entry-content') or contains(@class, 'c-entry-hero__image')] +body: //div[contains(@class, 'c-entry-content') or contains(@class, 'c-entry-hero__image')] +# for vergecasts, e.g. http://www.theverge.com/2013/8/22/4648566/the-vergecast-090-august-22th-2013-video +body: //article +body: //div[contains(concat(' ',normalize-space(@class),' '),' l-col__main ')] + +strip: //aside +strip: //nav + +strip_id_or_class: gallery +strip_id_or_class: article-meta +strip_id_or_class: story-navigation +strip_id_or_class: slegend +strip_id_or_class: related-product-meta +strip_id_or_class: comments +strip_id_or_class: ui-jump-list +strip_id_or_class: pullquote +strip_id_or_class: m-ad +strip_id_or_class: social-sharing +strip_id_or_class: m-video-entry__excerpt +strip_id_or_class: hidden +strip_id_or_class: m-article__follow-bar +strip_id_or_class: m-article__share-buttons +strip_id_or_class: l-col__sidebar +strip_id_or_class: c-river +strip_id_or_class: chorus-ad-placement +strip_id_or_class: c-related-list + +#2017 +strip_id_or_class: e-image__meta +replace_string(<strong> </strong>): <!-- nothing --> + +replace_string(<noscript>): <div> +replace_string(</noscript>): </div> + +parser: html5php + +test_url: http://www.theverge.com/2017/1/12/14244634/signal-app-secure-messaging-trump-surveillance-encryption +test_contains: While he now lives in an Oakland apartment + +test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review +test_url: 
http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review +test_url: http://www.theverge.com/2013/2/24/4026114/barnes-noble-shifting-focus-away-from-nook-hardware +test_url: http://www.theverge.com/2014/6/19/5824072/top-shelf-living-the-dream +test_url: http://www.theverge.com/rss/frontpage diff --git a/data/GrabberConfig/theweek.com.txt b/data/GrabberConfig/theweek.com.txt new file mode 100644 index 00000000..4bc95323 --- /dev/null +++ b/data/GrabberConfig/theweek.com.txt @@ -0,0 +1,8 @@ +body: //div[@class="article-body underline"] + +strip_id_or_class: appendedAds +strip_id_or_class: right-ad-container +strip_id_or_class: ad_wrap + +test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill +test_url: https://theweek.com/articles/642979/why-poor-whites-flock-donald-trump diff --git a/data/GrabberConfig/thinkprogress.org.txt b/data/GrabberConfig/thinkprogress.org.txt new file mode 100644 index 00000000..1eec4e3c --- /dev/null +++ b/data/GrabberConfig/thinkprogress.org.txt @@ -0,0 +1,4 @@ +author: //p[@class="byline"]/a +body: //div[@class="post"] + +test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/
\ No newline at end of file diff --git a/data/GrabberConfig/thisamericanlife.org.txt b/data/GrabberConfig/thisamericanlife.org.txt new file mode 100644 index 00000000..df7f81cd --- /dev/null +++ b/data/GrabberConfig/thisamericanlife.org.txt @@ -0,0 +1,4 @@ +body: //div[@id='content'] + +test_url: https://www.thisamericanlife.org/282/transcript +test_contains: I was part of sending an innocent man to jail diff --git a/data/GrabberConfig/thisiscolossal.com.txt b/data/GrabberConfig/thisiscolossal.com.txt new file mode 100644 index 00000000..ab16ce18 --- /dev/null +++ b/data/GrabberConfig/thisiscolossal.com.txt @@ -0,0 +1,25 @@ +# Author: zinnober + +tidy: no +prune: no + +# Set author +author: //a[contains(@rel, 'author')] + +# Content is here +body: //article + +# Tidy up before article +strip: //header + +# Get rid of doubled images +strip: //img[contains(@class, '-hidden')] + +# Tidy up after article +strip_id_or_class: social-list +strip_id_or_class: meta-info +strip: //footer + +# Try it yourself +test_url: http://www.thisiscolossal.com/2014/09/chicago-in-the-fog-by-michael-salisbury/ +test_url: http://www.thisiscolossal.com/2014/09/bird-portraits-ruffling-with-personality-by-leila-jeffreys/ diff --git a/data/GrabberConfig/threatpost.com.txt b/data/GrabberConfig/threatpost.com.txt new file mode 100644 index 00000000..29983bd7 --- /dev/null +++ b/data/GrabberConfig/threatpost.com.txt @@ -0,0 +1,6 @@ +title: //header/h1[contains(concat(' ',normalize-space(@class),' '), ' c-article__title ')] +body: //div[contains(concat(' ',normalize-space(@class),' '), ' c-article__main ')] + +strip_id_or_class: c-article__footer + +test_url: https://threatpost.com/lock-screen-bypass-bug-quietly-patched-in-handsets/139141/ diff --git a/data/GrabberConfig/tidbits.com.txt b/data/GrabberConfig/tidbits.com.txt new file mode 100644 index 00000000..1950e58e --- /dev/null +++ b/data/GrabberConfig/tidbits.com.txt @@ -0,0 +1,3 @@ +author: //span[@class='fn'] +date: 
substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|') +test_url: http://tidbits.com/article/12651
\ No newline at end of file diff --git a/data/GrabberConfig/time.com.txt b/data/GrabberConfig/time.com.txt new file mode 100644 index 00000000..8d5616e9 --- /dev/null +++ b/data/GrabberConfig/time.com.txt @@ -0,0 +1,15 @@ +title: //h1[contains(@class, 'article-title')] +author: //article//span[contains(@class, 'byline')] +date: //time[@pubdate]/@datetime +body: //section[contains(@class, 'article-body')] +prune: no +tidy: no + +#Skip GDPR tracking wall +http_header(Cookie): euConsent=true + +strip: //figcaption +strip: //p[contains(., 'MORE:') and ./a] +strip: //aside + +test_url: http://time.com/14478/emotions-may-not-be-so-universal-after-all/ diff --git a/data/GrabberConfig/timeshighereducation.co.uk.txt b/data/GrabberConfig/timeshighereducation.co.uk.txt new file mode 100644 index 00000000..af1c23ce --- /dev/null +++ b/data/GrabberConfig/timeshighereducation.co.uk.txt @@ -0,0 +1,6 @@ +title: //h1 +body: //div[@class="storytext"] +strip: //div[@id="thelogin"] +strip: //*[@class="hide"] +strip: //div[@id="anchored"] +test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1
\ No newline at end of file diff --git a/data/GrabberConfig/timeshighereducation.com.txt b/data/GrabberConfig/timeshighereducation.com.txt new file mode 100644 index 00000000..ba52788d --- /dev/null +++ b/data/GrabberConfig/timeshighereducation.com.txt @@ -0,0 +1,3 @@ +body: //div[@class="col-md-12 radix-layouts-contentheader panel-panel"] +strip: //div[@class="htmlContent subscribe_box"] +test_url: https://www.timeshighereducation.com/blog/jeremy-corbyn-serious-about-free-higher-education diff --git a/data/GrabberConfig/tipb.com.txt b/data/GrabberConfig/tipb.com.txt new file mode 100644 index 00000000..b8474d97 --- /dev/null +++ b/data/GrabberConfig/tipb.com.txt @@ -0,0 +1,9 @@ +body: //div[@id='content'] + +strip_id_or_class: featured-box +strip_id_or_class: postmeta +strip_id_or_class: respond + +author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')] +date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ') +test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/
\ No newline at end of file diff --git a/data/GrabberConfig/titanic-magazin.de.txt b/data/GrabberConfig/titanic-magazin.de.txt new file mode 100644 index 00000000..70108e36 --- /dev/null +++ b/data/GrabberConfig/titanic-magazin.de.txt @@ -0,0 +1,8 @@ +body: //div[contains(@class, 'tt_news-bodytext')] + +# cut html short +find_string: <!--TYPO3SEARCH_end--> +replace_string: </div></body></html> + +test_url: http://www.titanic-magazin.de/ich.war.bei.der.waffen.rss +test_url: http://www.titanic-magazin.de/news/wenig-bekannte-fakten-ueber-2014-6986/
\ No newline at end of file diff --git a/data/GrabberConfig/tldp.org.txt b/data/GrabberConfig/tldp.org.txt new file mode 100644 index 00000000..7dd5cdb5 --- /dev/null +++ b/data/GrabberConfig/tldp.org.txt @@ -0,0 +1,6 @@ +title: //title +body: //h2 | //p | //ul +prune: no +tidy: no + +test_url: http://www.tldp.org/HOWTO/Plug-and-Play-HOWTO-7.html
\ No newline at end of file diff --git a/data/GrabberConfig/tnr.com.txt b/data/GrabberConfig/tnr.com.txt new file mode 100644 index 00000000..199f5d13 --- /dev/null +++ b/data/GrabberConfig/tnr.com.txt @@ -0,0 +1,17 @@ +title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1 +title: //div[contains(@class, 'article_detail')]//h1 +title: //h1 + +body: //div[contains(@class, 'article_detail')] + +author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3 +author: //div[@class='author']//h3 +strip: //div[contains(@class, 'field-field-book-cover')] + +date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '') + +prune: no + +single_page_link: //a[@class='print-page'] + +test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled
\ No newline at end of file diff --git a/data/GrabberConfig/tofugu.com.txt b/data/GrabberConfig/tofugu.com.txt new file mode 100644 index 00000000..5ac9d6a0 --- /dev/null +++ b/data/GrabberConfig/tofugu.com.txt @@ -0,0 +1,3 @@ +body://div[@class='entry-content'] + +test_url: http://www.tofugu.com/2015/07/20/interview-with-toriena-japanese-chiptune/ diff --git a/data/GrabberConfig/tomdispatch.com.txt b/data/GrabberConfig/tomdispatch.com.txt new file mode 100644 index 00000000..701a2122 --- /dev/null +++ b/data/GrabberConfig/tomdispatch.com.txt @@ -0,0 +1,6 @@ +title: //div[@id='maincontent']//div[@class='title'] +body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat'] + +tidy: no + +test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/
\ No newline at end of file diff --git a/data/GrabberConfig/tomshardware.com.txt b/data/GrabberConfig/tomshardware.com.txt new file mode 100644 index 00000000..5f8cbdd0 --- /dev/null +++ b/data/GrabberConfig/tomshardware.com.txt @@ -0,0 +1,8 @@ +tidy: no +title: //title +author: //a[@itemprop = 'author'] +date: //time[@itemprop = 'datePublished'] +body: //div[@id = 'intelliTXT'] + +next_page_link: //link[@rel='next']/@href +test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html
\ No newline at end of file diff --git a/data/GrabberConfig/tomshardware.de.txt b/data/GrabberConfig/tomshardware.de.txt new file mode 100644 index 00000000..eee57ccf --- /dev/null +++ b/data/GrabberConfig/tomshardware.de.txt @@ -0,0 +1,12 @@ +body://div[@id="news-content"]/div[@id="intelliTXT"][1] + +author://div[@id="header-news-infos"]/a[1] + +date: //div[@id="header-news-infos"]/span[1] + +title://h1[@id="header-news-title" and @class="hardwareTitle"][1] + +strip://div[@id="news-content"]/div[@id="intelliTXT"]/table + +footnotes: no +test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html
\ No newline at end of file diff --git a/data/GrabberConfig/toolinux.com.txt b/data/GrabberConfig/toolinux.com.txt new file mode 100644 index 00000000..3f1a8405 --- /dev/null +++ b/data/GrabberConfig/toolinux.com.txt @@ -0,0 +1,5 @@ +title: //h2[contains(@class,'news')] +body: //div[contains(@class,'articleContent')] +date: substring-after(//div[@class = 'SupaDate']/text(), 'le') + +test_url: http://www.toolinux.com/Wi-Fi-Linksys-WRT-la-legende-de diff --git a/data/GrabberConfig/toolsandtoys.net.txt b/data/GrabberConfig/toolsandtoys.net.txt new file mode 100644 index 00000000..bb45d890 --- /dev/null +++ b/data/GrabberConfig/toolsandtoys.net.txt @@ -0,0 +1,6 @@ +body: //div[@class='post'] + +strip: //div[@class='social'] +strip: //span[@class='next'] +strip: //span[@class='previous'] +test_url: http://toolsandtoys.net/noble-tonic-02/
\ No newline at end of file diff --git a/data/GrabberConfig/tourmag.com.txt b/data/GrabberConfig/tourmag.com.txt new file mode 100644 index 00000000..4e953b44 --- /dev/null +++ b/data/GrabberConfig/tourmag.com.txt @@ -0,0 +1,16 @@ + + +body: //h2[contains(concat(' ',normalize-space(@class),' '),' soustitre ')] | //div[contains(concat(' ',normalize-space(@class),' '),' chapeau ')] | //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] + +author: //meta[@name="author"]/@content + +strip_id_or_class: titre +strip_id_or_class: auteur +strip_id_or_class: boutons +strip_id_or_class: boutons_ligne +strip_id_or_class: module-article_connexe + +find_string: <div class="video-wrapper" +replace_string: [VIDEO]<div class="video-wrapper" + +test_url: https://www.tourmag.com/TUI-France-Cap-sur-Sun-by-TUI-un-eductour-inoubliable_a92065.html diff --git a/data/GrabberConfig/touteduc.fr.txt b/data/GrabberConfig/touteduc.fr.txt new file mode 100644 index 00000000..74d61bbb --- /dev/null +++ b/data/GrabberConfig/touteduc.fr.txt @@ -0,0 +1,18 @@ + +body: //article[@class="alaune"] + +strip_id_or_class: retour +strip_id_or_class: subheader + +test_url: http://www.touteduc.fr/fr/culture/id-9978-l-ife-dresse-un-panorama-des-politiques-de-jeunesse-et-pointe-du-doigt-les-manques-du-systeme-francais + +#----------------------------------------------------------- +# Wallabag-specific login directives (not supported in FTR): +#----------------------------------------------------------- +requires_login: yes +not_logged_in_xpath: //a[text()='Se connecter →'] +login_uri: http://www.touteduc.fr/fr/espace-membre/accueil +login_username_field: login +login_password_field: password +login_extra_fields: connexion-submit=Connexion + diff --git a/data/GrabberConfig/towerofthehand.com.txt b/data/GrabberConfig/towerofthehand.com.txt new file mode 100644 index 00000000..a4d87d12 --- /dev/null +++ b/data/GrabberConfig/towerofthehand.com.txt @@ -0,0 +1,10 @@ +title: 
//div[@id='headline'] +body: //div[@class='entry_text'] +author: //div[text() = 'Author:']/following-sibling::div/a +date: //div[text() = 'Published:']/following-sibling::div +single_page_link: //a[@href='noscript.html'] +prune: no + +test_url: http://towerofthehand.com/blog/2014/08/08-pitch-this-got-spinoff/index.html +test_url: http://towerofthehand.com/blog/2014/07/31-definitions-and-embodiments/index.html +test_url: http://towerofthehand.com/blog/2014/07/03-hero-with-thousand-faces/index.html diff --git a/data/GrabberConfig/tracks.ranea.org.txt b/data/GrabberConfig/tracks.ranea.org.txt new file mode 100644 index 00000000..5a386470 --- /dev/null +++ b/data/GrabberConfig/tracks.ranea.org.txt @@ -0,0 +1,14 @@ +# Metadata +title: substring-after(//title, 'Coyote Tracks - ') +author: //meta[@name="author"]/@content +date: //div[@class="post_header"]/a + +# Content Pruning +strip: //div[@class="column left"] +strip: //div[@class="pages"] +strip: //a[@class="text_title"] +strip: //ol[@class="notes"] + +dissolve: //div[@class='column right']/ul +dissolve: //li[@class='post'] +test_url: http://tracks.ranea.org/post/31431060205/the-next-big-uh-slightly-taller-thing
\ No newline at end of file diff --git a/data/GrabberConfig/trailer.web-view.net.txt b/data/GrabberConfig/trailer.web-view.net.txt new file mode 100644 index 00000000..e7a9c82d --- /dev/null +++ b/data/GrabberConfig/trailer.web-view.net.txt @@ -0,0 +1,2 @@ +title: concat(substring-before(//title,':'),': ',//div[@class='Date2']) +test_url: http://trailer.web-view.net/Show/0XC4EFE5D648B716BA2E134BC7CE61B9CC001E04F11E9434438186735DBD637488.htm
\ No newline at end of file diff --git a/data/GrabberConfig/trailers.apple.com.txt b/data/GrabberConfig/trailers.apple.com.txt new file mode 100644 index 00000000..556d9522 --- /dev/null +++ b/data/GrabberConfig/trailers.apple.com.txt @@ -0,0 +1,9 @@ +# written by Jan Lukas Gernert + +title: //div[@id='hero']/h1 +author: //meta[@name='Author']/@content +body: //section[@id='gallery-film-info-details'] + +strip: //ul[@id='share'] + +test_url: http://trailers.apple.com/trailers/independent/londonhasfallen/ diff --git a/data/GrabberConfig/trailerzone.de.txt b/data/GrabberConfig/trailerzone.de.txt new file mode 100644 index 00000000..02151a63 --- /dev/null +++ b/data/GrabberConfig/trailerzone.de.txt @@ -0,0 +1,9 @@ +body: //div[@id='video' or @id='main'] + +strip_id_or_class: socialshareprivacy2 +strip_id_or_class: wp_rp_first + +find_string: Genre</strong> +replace_string: </strong></p><p><strong>Genre</strong> + +test_url: http://www.trailerzone.de/g-i-joe-2-die-abrechnung/
\ No newline at end of file diff --git a/data/GrabberConfig/traningslara.se.txt b/data/GrabberConfig/traningslara.se.txt new file mode 100644 index 00000000..d6cfb6db --- /dev/null +++ b/data/GrabberConfig/traningslara.se.txt @@ -0,0 +1,8 @@ +title: //div[@class="Post-body"]//span[@class="PostHeader"] +author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"] +date: substring-before(//div[@class="PostHeaderIcons metadata"], '|') +body: //div[@class="Post-body"] +strip_id_or_class: print1 +strip_id_or_class: metadata +strip_id_or_class: authorbox +test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/
\ No newline at end of file diff --git a/data/GrabberConfig/triblive.com.txt b/data/GrabberConfig/triblive.com.txt new file mode 100644 index 00000000..663cafe1 --- /dev/null +++ b/data/GrabberConfig/triblive.com.txt @@ -0,0 +1,13 @@ +title: //title +author: //span/a +date: substring-after(//small,'Published:') + +strip: //h1[@class='vert_class'] +strip: //h1[@class='headline'] +strip: //img[contains(@src,'logo_triblive.gif')] + +#strip: //h6 +#strip_img_src: logo_triblive.gif + +single_page_link: //a[@class='stprint'] +test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case
\ No newline at end of file diff --git a/data/GrabberConfig/troyhunt.com.txt b/data/GrabberConfig/troyhunt.com.txt new file mode 100644 index 00000000..e2a57eae --- /dev/null +++ b/data/GrabberConfig/troyhunt.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.troyhunt.com%2Fthe-effectiveness-of-publicly-shaming-bad-security%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' content-wrapper ')] +test_url: https://www.troyhunt.com/the-effectiveness-of-publicly-shaming-bad-security/
\ No newline at end of file diff --git a/data/GrabberConfig/truthdig.com.txt b/data/GrabberConfig/truthdig.com.txt new file mode 100644 index 00000000..0b34ec4f --- /dev/null +++ b/data/GrabberConfig/truthdig.com.txt @@ -0,0 +1,18 @@ +title: //div[@class='printbody']/h1 +body: //div[@class='printbody'] +# date: //meta[@property="article:published_time"]/@content +prune: no + +strip: //div[@class='printbody']//a[@href='http://www.truthdig.com/'] +strip: //table[@class='footer'] +strip: //h6[contains(., 'http://')] + +strip_id_or_class: masthead +strip_id_or_class: addthis +strip_id_or_class: article_tools + +single_page_link: //a[contains(@href, '/print/')] + +test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/ +test_contains: Most important, we must stop being afraid. +test_url: http://www.truthdig.com/dig/item/the_death_of_truth_20130505/ diff --git a/data/GrabberConfig/tthfanfic.org.txt b/data/GrabberConfig/tthfanfic.org.txt new file mode 100644 index 00000000..63537c10 --- /dev/null +++ b/data/GrabberConfig/tthfanfic.org.txt @@ -0,0 +1,4 @@ +title: //h2 +author: //a[starts-with(@href, '/AuthorStories')] +body: //div[@id='storyinnerbody'] +test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm
\ No newline at end of file diff --git a/data/GrabberConfig/tuaw.com.txt b/data/GrabberConfig/tuaw.com.txt new file mode 100644 index 00000000..2af00c27 --- /dev/null +++ b/data/GrabberConfig/tuaw.com.txt @@ -0,0 +1,6 @@ +title: //h1[@class='posttitle'] +author: //span[@class='author']/a +date: //span[@class='timestamp'] +body: //div[@class='body'] + +test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/
\ No newline at end of file diff --git a/data/GrabberConfig/tuhdo.github.io.txt b/data/GrabberConfig/tuhdo.github.io.txt new file mode 100644 index 00000000..beb551fd --- /dev/null +++ b/data/GrabberConfig/tuhdo.github.io.txt @@ -0,0 +1,7 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Ftuhdo.github.io%2Femacs-tutor.html + +body: //div[@id='content'] +strip_id_or_class: table-of-contents +test_url: https://tuhdo.github.io/emacs-tutor.html diff --git a/data/GrabberConfig/turnoff.us.txt b/data/GrabberConfig/turnoff.us.txt new file mode 100644 index 00000000..f90ba552 --- /dev/null +++ b/data/GrabberConfig/turnoff.us.txt @@ -0,0 +1,3 @@ +title: //h1[@class='post-title'] +body: //article[@class='post-content'] +test_url: https://turnoff.us/geek/the-depressed-developer-13/ diff --git a/data/GrabberConfig/tvtropes.org.txt b/data/GrabberConfig/tvtropes.org.txt new file mode 100644 index 00000000..3cc3a9cf --- /dev/null +++ b/data/GrabberConfig/tvtropes.org.txt @@ -0,0 +1,20 @@ +# Google Custom Search +strip_id_or_class: google_branding_style + +# Avoid double title +strip_id_or_class: pagetitle + +# external links are labelled +strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif + +title: //div[@class="pagetitle"] +body: //div[@id="wikitext"] + +# don't get clever. +strip_comments: no +prune: no + +# navigation in footer lives inside the wikitext div, annoyingly. +strip_id_or_class: pathholder + +test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters
\ No newline at end of file diff --git a/data/GrabberConfig/tweakers.net.txt b/data/GrabberConfig/tweakers.net.txt new file mode 100644 index 00000000..fd2cfa1d --- /dev/null +++ b/data/GrabberConfig/tweakers.net.txt @@ -0,0 +1,13 @@ +body: //div[@class='articleContent'] +single_page_link: //a[contains(., 'Singlepage layout')] + +strip: //q[@class='streamer'] +strip_id_or_class: notificationsContainer +prune: no + +http_header(user-agent): curl/7.54.0 + +test_url: http://tweakers.net/feeds/mixed.xml +test_url: https://tweakers.net/reviews/4110/tv-kijken-zonder-kabels-vodafone-tv-anywhere-en-kpn-play.html +test_url: https://tweakers.net/reviews/4113/all/hp-spectre-13-x360-convertible-die-vrijwel-alles-goed-doet.html +test_url: https://tweakers.net/nieuws/142245/amerikaanse-openbaar-aanklager-eist-dat-facebook-encryptie-messenger-verwijdert.html diff --git a/data/GrabberConfig/twitter.com.txt b/data/GrabberConfig/twitter.com.txt new file mode 100644 index 00000000..0a010a45 --- /dev/null +++ b/data/GrabberConfig/twitter.com.txt @@ -0,0 +1,16 @@ +title: //title +body: (//p[contains(@class, 'js-tweet-text')])[1] +author: (//strong[contains(@class, 'fullname')])[1] +date: //span[contains(@class, 'js-short-timestamp')]/@data-time + +# mobile site (automatic redirect - noscript meta refresh) +author: (//div[contains(@class, 'fullname')])[1] +body: (//div[contains(@class, 'TweetDetail-text')])[1] + +parser: html5php + +prune: no +tidy: no + +test_url: https://twitter.com/medialens/status/216883678582804480 +test_contains: is all but alone in challenging the tsunami of UK diff --git a/data/GrabberConfig/ubuntugeek.com.txt b/data/GrabberConfig/ubuntugeek.com.txt new file mode 100644 index 00000000..9c159b25 --- /dev/null +++ b/data/GrabberConfig/ubuntugeek.com.txt @@ -0,0 +1,9 @@ +title: //header[@class='entry-header']/h1[@class='entry-title']/a +author: //span[@class="heatmapthemead-post-details"]/span[@class='byline']/span[@class='author vcard']/a +date: 
//span[@class="heatmapthemead-post-details"]/a/time/@datetime +body: //div[@class="post-content description"] + +strip: //div[@id="essb_links essb_counters essb_displayed_bottom essb_share essb_template_fancy-retina essb_1179534949 essb_links_right print-no"] +strip: //div[@class="tagcloud"] + +test_url: http://www.ubuntugeek.com/install-glpi-it-and-asset-managemet-software-on-ubuntu-16-04-server.html diff --git a/data/GrabberConfig/uefa.com.txt b/data/GrabberConfig/uefa.com.txt new file mode 100644 index 00000000..3469be03 --- /dev/null +++ b/data/GrabberConfig/uefa.com.txt @@ -0,0 +1,6 @@ +body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText'] +strip: //div[contains(@class, 'mpindex')] +prune: no +tidy: no + +test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html
\ No newline at end of file diff --git a/data/GrabberConfig/uk.xbox360.ign.com.txt b/data/GrabberConfig/uk.xbox360.ign.com.txt new file mode 100644 index 00000000..cd9c1361 --- /dev/null +++ b/data/GrabberConfig/uk.xbox360.ign.com.txt @@ -0,0 +1,23 @@ +# applies to uk.ds.ign.com, uk.wii.ign.com etc. +# possibly to non-UK versions, but I can’t test that + +title: //h1[@class="headline"] +author: //div[@class="hdr-sub byline"]/a +date: //h2[@class="publish-date"]/span +body: //div[@id="main-article-content"] + +strip: //ul[@class="lnks-readmore"] + +strip: //div[@class="inlineImageCaption"] +# can’t make the images appear, so remove the captions + +strip: //div[@style="width:468px"] +# video caption links + +convert_double_br_tags: yes + +strip_comments: no +# otherwise the ‘Closing Comments’ are removed + +# Ratings box could do with some rearranging, but it’s tricky +test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html
\ No newline at end of file diff --git a/data/GrabberConfig/uni-watch.com.txt b/data/GrabberConfig/uni-watch.com.txt new file mode 100644 index 00000000..03b281a4 --- /dev/null +++ b/data/GrabberConfig/uni-watch.com.txt @@ -0,0 +1,16 @@ +author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on') +date: substring-after(//div[@class='post-byline'], ', on') + +# for some reason, the following is producing a "no text [48]" error +#title: //div[@class='post-headline'] + +body: //div[contains(@class, 'post-bodycopy')] + +# we solve the above issue by stripping out everything else we don't want +# these can probably all be removed if the body: command above worked +strip_id_or_class: reply +strip_id_or_class: left +strip_id_or_class: post-headline +strip_id_or_class: post-byline +strip_id_or_class: footer +test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/ diff --git a/data/GrabberConfig/unwinnable.com.txt b/data/GrabberConfig/unwinnable.com.txt new file mode 100644 index 00000000..05ad86a5 --- /dev/null +++ b/data/GrabberConfig/unwinnable.com.txt @@ -0,0 +1,9 @@ +title: //h1[@class='postTitle'] +author: //a[@rel='author'] +date: substring-before(//h4[@class='postAuthor'], '|') +body: //div[@class='postContent'] + +strip: //div[@class='simplePullQuote'] + +wrap_in(figure): //img +test_url: http://www.unwinnable.com/2013/04/23/gratifying-play/
\ No newline at end of file diff --git a/data/GrabberConfig/uppsalafria.se.txt b/data/GrabberConfig/uppsalafria.se.txt new file mode 100644 index 00000000..79c59ece --- /dev/null +++ b/data/GrabberConfig/uppsalafria.se.txt @@ -0,0 +1,7 @@ +body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] +author: //article//div[contains(@class, 'field-byline')] +strip_id_or_class: rekommenderade +strip_id_or_class: disqus +strip_id_or_class: annonser + +test_url: http://www.uppsalafria.se/artikel/97167
\ No newline at end of file diff --git a/data/GrabberConfig/urbandictionary.com.txt b/data/GrabberConfig/urbandictionary.com.txt new file mode 100644 index 00000000..385c95ca --- /dev/null +++ b/data/GrabberConfig/urbandictionary.com.txt @@ -0,0 +1,3 @@ +title: //title +body: //table[@id='entries'] +test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass diff --git a/data/GrabberConfig/us-cert.gov.txt b/data/GrabberConfig/us-cert.gov.txt new file mode 100644 index 00000000..35ea19bc --- /dev/null +++ b/data/GrabberConfig/us-cert.gov.txt @@ -0,0 +1,9 @@ +# Page title +title: //h1[@id='page-title'] +# Page subtitle +title: //h2[@id='page-sub-title'] +# Page metadata +date: //footer[contains(concat(' ',normalize-space(@class),' '),' submitted ')] +# Page body +body: //article[contains(concat(' ',normalize-space(@class),' '),' node ')]//div[contains(concat(' ',normalize-space(@class),' '),' content ') and (contains(concat(' ',normalize-space(@class),' '),' clearfix '))] +test_url: https://www.us-cert.gov/ncas/alerts/TA17-181A
\ No newline at end of file diff --git a/data/GrabberConfig/usatoday.com.txt b/data/GrabberConfig/usatoday.com.txt new file mode 100644 index 00000000..d01782eb --- /dev/null +++ b/data/GrabberConfig/usatoday.com.txt @@ -0,0 +1,8 @@ +date: //meta[@itemprop="datePublished"]/@content +author: //div[@itemprop="author"] +body: //div[contains(@itemprop, 'articleBody')] + +strip_id_or_class: share-tools + +test_url: https://www.usatoday.com/story/news/world/2014/03/18/malaysia-plane-search/6552429/ +test_url: http://rssfeeds.usatoday.com/usatoday-NewsTopStories diff --git a/data/GrabberConfig/usccb.org.txt b/data/GrabberConfig/usccb.org.txt new file mode 100644 index 00000000..30c28823 --- /dev/null +++ b/data/GrabberConfig/usccb.org.txt @@ -0,0 +1,6 @@ +body: //div[@id='CS_Element_maincontent'] + +tidy: no +prune: no + +test_url: http://www.usccb.org/bible/readings/072412.cfm
\ No newline at end of file diff --git a/data/GrabberConfig/useit.com.txt b/data/GrabberConfig/useit.com.txt new file mode 100644 index 00000000..b8511c7c --- /dev/null +++ b/data/GrabberConfig/useit.com.txt @@ -0,0 +1,8 @@ +title: //h1 + +date: substring-after(//p[@class='overline']/strong, ',') +body: //div[@class="maintext"] +strip: //p[@class='overline'] +strip: //h1 +tidy: no +test_url: http://www.useit.com/alertbox/mobile-startup-screen.html
\ No newline at end of file diff --git a/data/GrabberConfig/usfirst.org.txt b/data/GrabberConfig/usfirst.org.txt new file mode 100644 index 00000000..f02b2d3e --- /dev/null +++ b/data/GrabberConfig/usfirst.org.txt @@ -0,0 +1,6 @@ +title: //meta[@property='dc:title']/@content +date: //div[@class='content']//span[@property='dc:date']/@content +body: //div[@property='content:encoded'] +prune: no + +test_url: http://www.usfirst.org/roboticsprograms/frc/Photo-From-Kickoff-Filming
\ No newline at end of file diff --git a/data/GrabberConfig/utdailybeacon.com.txt b/data/GrabberConfig/utdailybeacon.com.txt new file mode 100644 index 00000000..c4593d55 --- /dev/null +++ b/data/GrabberConfig/utdailybeacon.com.txt @@ -0,0 +1,2 @@ +body: //div[@id='blox-story-text'] +test_url: http://www.utdailybeacon.com/news/article_ccf6d024-0f15-11e5-ae29-9f63598deb81.html diff --git a/data/GrabberConfig/utux.fr.txt b/data/GrabberConfig/utux.fr.txt new file mode 100644 index 00000000..e25ec470 --- /dev/null +++ b/data/GrabberConfig/utux.fr.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Futux.fr%2Findex.php%3Farticle166%2Fle-libre-c-est-bien + +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')] +test_url: https://utux.fr/index.php?article166/le-libre-c-est-bien
\ No newline at end of file diff --git a/data/GrabberConfig/ux.artu.tv.txt b/data/GrabberConfig/ux.artu.tv.txt new file mode 100644 index 00000000..c69f2df9 --- /dev/null +++ b/data/GrabberConfig/ux.artu.tv.txt @@ -0,0 +1,7 @@ +author: ("Arturo Toledo") +title: //div[@class="post"]/h2 +body: //div[@class="entry"] + +# Remove Twitter button +strip: //div[@class="entry"]/p[2]/a/img +test_url: http://ux.artu.tv/?p=192
\ No newline at end of file diff --git a/data/GrabberConfig/vanityfair.com.txt b/data/GrabberConfig/vanityfair.com.txt new file mode 100644 index 00000000..e340ed30 --- /dev/null +++ b/data/GrabberConfig/vanityfair.com.txt @@ -0,0 +1,10 @@ +body: //div[(contains(concat(' ',normalize-space(@class),' '),' article-content-body '))] + +# we can't load this (JS load) so may as well strip it to remove byline +strip: //figure[contains(@class, 'main-image')] + +test_url: http://www.vanityfair.com/news/2016/02/george-w-bush-donald-trump-iraq +test_contains: riding high off a fourth-place finish in New Hampshire + +test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808 +test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201 diff --git a/data/GrabberConfig/variety.com.txt b/data/GrabberConfig/variety.com.txt new file mode 100644 index 00000000..c7f4458f --- /dev/null +++ b/data/GrabberConfig/variety.com.txt @@ -0,0 +1,11 @@ +body: //div[@class='article-body'] + +strip_id_or_class: tags +strip_id_or_class: ad-below-tags +strip_id_or_class: comments-overview +strip_id_or_class: widget +strip_id_or_class: featured-jobs-template +strip_id_or_class: article-river +strip_id_or_class: article-comments + +test_url: http://variety.com/2016/film/news/jake-gyllenhaal-carey-mulligan-wildlife-1201869072/ diff --git a/data/GrabberConfig/varsity.co.uk.txt b/data/GrabberConfig/varsity.co.uk.txt new file mode 100644 index 00000000..dfbf69cf --- /dev/null +++ b/data/GrabberConfig/varsity.co.uk.txt @@ -0,0 +1,4 @@ +# FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser + +strip: //h2 +test_url: http://www.varsity.co.uk/reviews/2662
\ No newline at end of file diff --git a/data/GrabberConfig/vea.gov.vn.txt b/data/GrabberConfig/vea.gov.vn.txt new file mode 100644 index 00000000..9c8420ce --- /dev/null +++ b/data/GrabberConfig/vea.gov.vn.txt @@ -0,0 +1,7 @@ +title://div[@class="detail-new-title"] +body://div[@class="innerpad"] +strip://div[@class="ArticleUtility"] +strip://div[@class="commentPost"] +strip://div[@class="comment-box"] +strip://div[@id="TinLienQuan"] +test_url: http://vea.gov.vn/vn/tintuc/tintuchangngay/Pages/T%C4%83ng-c%C6%B0%E1%BB%9Dng-b%E1%BA%A3o-t%E1%BB%93n-%C4%91%E1%BB%99ng-v%E1%BA%ADt-hoang-d%C3%A3-%E1%BB%9F-Vi%E1%BB%87t-Nam.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/vedomosti.ru.txt b/data/GrabberConfig/vedomosti.ru.txt new file mode 100644 index 00000000..265f9fc7 --- /dev/null +++ b/data/GrabberConfig/vedomosti.ru.txt @@ -0,0 +1,3 @@ +title: //td[@class='second_content']/h1 +body: //td[@class='second_content']/div[@class='article_text'] +test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru
\ No newline at end of file diff --git a/data/GrabberConfig/velomotion.de.txt b/data/GrabberConfig/velomotion.de.txt new file mode 100644 index 00000000..2ceb13ce --- /dev/null +++ b/data/GrabberConfig/velomotion.de.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fvelomotion.de%2F2017%2F07%2Fgocycle-g3-e-bike-kompakt%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ') and (contains(concat(' ',normalize-space(@class),' '),' clearfix '))] +test_url: http://velomotion.de/2017/07/gocycle-g3-e-bike-kompakt/ diff --git a/data/GrabberConfig/venturebeat.com.txt b/data/GrabberConfig/venturebeat.com.txt new file mode 100644 index 00000000..d6321d79 --- /dev/null +++ b/data/GrabberConfig/venturebeat.com.txt @@ -0,0 +1,6 @@ +title: //h1[@class="entry-title"] +author: //div[@class="author-name"] +date: //span[@class="the-time"] +body: //div[@class="entry-content"] +strip: //div[@class="vb-gallery"] +test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1
\ No newline at end of file diff --git a/data/GrabberConfig/version2.dk.txt b/data/GrabberConfig/version2.dk.txt new file mode 100644 index 00000000..ad1c88c6 --- /dev/null +++ b/data/GrabberConfig/version2.dk.txt @@ -0,0 +1,12 @@ +title: //article/header/h1 + +author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a +date: //article/header/section[@class='byline']/span[@class='published']/span + +body: //article/section[@class='body'] + +convert_double_br_tags: yes + +# This is required, because Tidy chokes on the HTML5 tags... +tidy: no +test_url: https://www.version2.dk/artikel/amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere-17069 diff --git a/data/GrabberConfig/verybestbaking.com.txt b/data/GrabberConfig/verybestbaking.com.txt new file mode 100644 index 00000000..ad0fec66 --- /dev/null +++ b/data/GrabberConfig/verybestbaking.com.txt @@ -0,0 +1,7 @@ +title: //title +body: //div[contains(@class, 'printRecipe')] +strip: //div[@class='recipeHeader'] +prune: no +tidy: no +single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')] +test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/vg.no.txt b/data/GrabberConfig/vg.no.txt new file mode 100644 index 00000000..bfadb4a7 --- /dev/null +++ b/data/GrabberConfig/vg.no.txt @@ -0,0 +1,3 @@ +body: //div[@id='artikkelspalte'] +strip_id_or_class: 'breadcrumb' +test_url: http://www.vg.no/spill/artikkel.php?artid=10003628
\ No newline at end of file diff --git a/data/GrabberConfig/viaoccitanie.tv.txt b/data/GrabberConfig/viaoccitanie.tv.txt new file mode 100644 index 00000000..5348e94b --- /dev/null +++ b/data/GrabberConfig/viaoccitanie.tv.txt @@ -0,0 +1,19 @@ + +author: //meta[@name='author']/@content +body: //div[@id='video-news'] | //div[@class="col-sm-9"]/div[@class='text'] + +# --------------------------------- +# replace <iframe> video by a link: +# --------------------------------- +find_string: <iframe +replace_string: <a +find_string: data-src="https:// +replace_string: href="https:// +find_string: ></iframe> +replace_string: >[video]</a> + +strip: //div[@class='related-title']/following-sibling::* +strip: //div[@class='related-title'] +strip_id_or_class: left-sidebar + +test_url: https://viaoccitanie.tv/lartiste-de-plein-vent-michel-batlle-expose-ses-oeuvres-au-chateau-de-lareole/ diff --git a/data/GrabberConfig/video.forbes.com.txt b/data/GrabberConfig/video.forbes.com.txt new file mode 100644 index 00000000..5db77463 --- /dev/null +++ b/data/GrabberConfig/video.forbes.com.txt @@ -0,0 +1,9 @@ +title: concat("Video: ", //div[@id='currentVideoTitleDivId']) +body: //div[@id='currentVideoDescriptionId'] +author: //meta[@name='author']/@content + +replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease + +replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease + +test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works
\ No newline at end of file diff --git a/data/GrabberConfig/videogum.com.txt b/data/GrabberConfig/videogum.com.txt new file mode 100644 index 00000000..d93780ca --- /dev/null +++ b/data/GrabberConfig/videogum.com.txt @@ -0,0 +1,6 @@ +title: //h2[@class='posttitle'] +date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by') +date: //span[@class='postdate'] +author: //span[@class='postdate']/a +body: //div[@class='entry line_top'] +test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/
\ No newline at end of file diff --git a/data/GrabberConfig/vimeo.com.txt b/data/GrabberConfig/vimeo.com.txt new file mode 100644 index 00000000..f36c9c57 --- /dev/null +++ b/data/GrabberConfig/vimeo.com.txt @@ -0,0 +1,17 @@ +title: //title +body: //iframe + +find_string: <html><iframe +replace_string: <iframe id="video" + +find_string: ></iframe></html> +replace_string: ></iframe> + +replace_string("): " + +single_page_link: //link[@type='text/xml+oembed'] + +prune: no +tidy: no + +test_url: http://vimeo.com/35941909
\ No newline at end of file diff --git a/data/GrabberConfig/viply.de.txt b/data/GrabberConfig/viply.de.txt new file mode 100644 index 00000000..e3599c9d --- /dev/null +++ b/data/GrabberConfig/viply.de.txt @@ -0,0 +1,12 @@ +title: //div[@id='singletext']//h1 +body: //div[contains(@class, 'mypictureborder')] | //div[@id='singletext'] +prune: no + +strip_id_or_class: singletostart +strip_id_or_class: navigation +strip_id_or_class: social +strip_id_or_class: single_topwrapper +strip: //a[contains(., 'Nächster Artikel')] + +test_url: http://www.viply.de/?p=87973 +test_url: http://www.viply.de/?feed=rss2
\ No newline at end of file diff --git a/data/GrabberConfig/visir.is.txt b/data/GrabberConfig/visir.is.txt new file mode 100644 index 00000000..04e09102 --- /dev/null +++ b/data/GrabberConfig/visir.is.txt @@ -0,0 +1,14 @@ +# Author's name, when present, has 'skrifar:' ('writes:') appended to it. +# In case of multiple authors, this would be 'skrifa:', hence only 7 characters +# are stripped off. +author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7) + +date: //span[@class='date'] +title: //h1 +body: //div[@class='paragraph'] + +# Strip out author string when present +strip: //div[@class='paragraph']/div[@class='meta'] + +convert_double_br_tags: yes +test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997
\ No newline at end of file diff --git a/data/GrabberConfig/vitispr.com.txt b/data/GrabberConfig/vitispr.com.txt new file mode 100644 index 00000000..f2d11c7c --- /dev/null +++ b/data/GrabberConfig/vitispr.com.txt @@ -0,0 +1,6 @@ +strip: //*[(@id = "ja-search")] +body: //*[(@id = "ja-mainbody")] +body: //*[(@id = "content-mass-bottom")] +strip://h3[contains(span,'Related Posts')] +strip://img +test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot
\ No newline at end of file diff --git a/data/GrabberConfig/vivirmexico.com.txt b/data/GrabberConfig/vivirmexico.com.txt new file mode 100644 index 00000000..e6a72700 --- /dev/null +++ b/data/GrabberConfig/vivirmexico.com.txt @@ -0,0 +1,2 @@ +body: //*[(@class = "historia")] +test_url: http://vivirmexico.com/2011/09/en-veracruz-arrojan-35-cuerpos-a-plena-luz-del-dia-esta-si-es-una-alarma-social
\ No newline at end of file diff --git a/data/GrabberConfig/vnexpress.net.txt b/data/GrabberConfig/vnexpress.net.txt new file mode 100644 index 00000000..e5ebc435 --- /dev/null +++ b/data/GrabberConfig/vnexpress.net.txt @@ -0,0 +1,8 @@ +body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table +strip://div[@class="box-item"] +strip://div[@id="ARTICLE_BANNER"] +strip://a +strip://div[@class="tag-parent"] +strip://div[@class="email-print txtr"] + +test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/
\ No newline at end of file diff --git a/data/GrabberConfig/voices.washingtonpost.com.txt b/data/GrabberConfig/voices.washingtonpost.com.txt new file mode 100644 index 00000000..1a0da2bd --- /dev/null +++ b/data/GrabberConfig/voices.washingtonpost.com.txt @@ -0,0 +1,7 @@ +title: //h1 +body: //div[@class='entrytext'] + +#GDPR cookies +http_header(Cookie): wp_devicetype=0; rplpwabt4=1; devicetype=0; osfam=0; de=; client_region=0; wp_gdpr=1|1; rplmct=1; washpost_poe=true; + +test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html diff --git a/data/GrabberConfig/volkskrant.nl.txt b/data/GrabberConfig/volkskrant.nl.txt new file mode 100644 index 00000000..f2f630f4 --- /dev/null +++ b/data/GrabberConfig/volkskrant.nl.txt @@ -0,0 +1,19 @@ +#bypass cookie check +single_page_link: //a[contains(@href, '/cookiewall/accept')] + +title: //h1[@itemprop="headline"] +body: //figure[contains(@class, 'article__top-image')] | //div[@itemprop="articleBody"] + +strip_id_or_class: ad-space + +strip: //div[contains(@class, 'media-container') and contains(@class, 'pull-right')] + +tidy: no +prune: no + +parser: html5php + +test_url: http://www.volkskrant.nl/sport/dossier-wereldvoetbalbond-fifa-wankelt~a4042695/ +test_contains: Het ging om de omstreden + +test_url: http://www.volkskrant.nl/nieuws/rss.xml diff --git a/data/GrabberConfig/voltairenet.org.txt b/data/GrabberConfig/voltairenet.org.txt new file mode 100644 index 00000000..b062bd7d --- /dev/null +++ b/data/GrabberConfig/voltairenet.org.txt @@ -0,0 +1,4 @@ +title: //h1[contains(concat(' ',normalize-space(@class),' '),' titre_serif_1 ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' texte_sans ')] + +test_url: http://www.voltairenet.org/article195149.html
\ No newline at end of file diff --git a/data/GrabberConfig/vox.com.txt b/data/GrabberConfig/vox.com.txt new file mode 100644 index 00000000..f231e429 --- /dev/null +++ b/data/GrabberConfig/vox.com.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder + +title: //h1[contains(concat(' ',normalize-space(@class),' '),' c-page-title ')] +body: //div[contains(concat(' ',normalize-space(@class),' '),' c-entry-content ')] +date: //time[contains(concat(' ',normalize-space(@class),' '),' c-byline__item ')] +strip: //h3 +test_url: http://www.vox.com/policy-and-politics/2016/11/28/13728086/trump-literally-and-seriously diff --git a/data/GrabberConfig/voxeurop.eu.txt b/data/GrabberConfig/voxeurop.eu.txt new file mode 100644 index 00000000..1fc4557f --- /dev/null +++ b/data/GrabberConfig/voxeurop.eu.txt @@ -0,0 +1,15 @@ + +body: //article//div[contains(concat(' ',normalize-space(@class),' '),' bodytext ')] + +author: //div[contains(concat(' ',normalize-space(@class),' '),' contentauthor ')]//a[@rel='author'] + +prune: no + +strip_id_or_class: ctafeedback +strip_id_or_class: toolssocial +strip_id_or_class: articlepartner + +#Strip the paragraph about the translator: +strip: //p[@class='floatr ruledtop']/strong/parent::p + +test_url: https://voxeurop.eu/en/2018/migrant-crisis-5122095 diff --git a/data/GrabberConfig/vr-zone.com.txt b/data/GrabberConfig/vr-zone.com.txt new file mode 100644 index 00000000..3d773f41 --- /dev/null +++ b/data/GrabberConfig/vr-zone.com.txt @@ -0,0 +1,14 @@ +tidy: no +prune: no + +title: //title +date: //time + +body: //div[@class='post-content entry-content'] + +strip: //a[@id='dd_start'] +strip: //iframe +strip: //div[@class='dd_outer'] +strip: //comments + +test_url: http://vr-zone.com/articles/amd-take-intel-8-core-fm3-socket-zen-desktop-cpus-next-year/91057.html diff --git a/data/GrabberConfig/vulture.com.txt b/data/GrabberConfig/vulture.com.txt new file mode 100644 
index 00000000..e5fafdfd --- /dev/null +++ b/data/GrabberConfig/vulture.com.txt @@ -0,0 +1,18 @@ +#copied from nymag.com.txt + +title: //h2[contains(@class, 'primary')] +body: //*[@itemprop="articleBody"] +body: //div[@id='story'] +author: //*[@class='by']/a +date: substring-after(//*[@class='date'], 'Published') + +#Skip GDPR warning +http_header(Cookie): nymuc=11111111111 + +parser: html5php +tidy: no + +next_page_link: //div[@class='page-navigation']//li[@class='next']/a + +test_url: http://www.vulture.com/2018/06/damsel-review.html +test_contains: after her favorite candy diff --git a/data/GrabberConfig/warnerbros.fr.txt b/data/GrabberConfig/warnerbros.fr.txt new file mode 100644 index 00000000..6215b727 --- /dev/null +++ b/data/GrabberConfig/warnerbros.fr.txt @@ -0,0 +1,2 @@ +body: //div[@class="article-body"] +test_url: https://www.warnerbros.fr/articles/magic-mike-xxl-adam-rodriguez-portrait diff --git a/data/GrabberConfig/warriordudimanche.net.txt b/data/GrabberConfig/warriordudimanche.net.txt new file mode 100644 index 00000000..db7c2494 --- /dev/null +++ b/data/GrabberConfig/warriordudimanche.net.txt @@ -0,0 +1,4 @@ +title: //article[contains(concat(' ',normalize-space(@class),' '),' article ')]//header//h1 +body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')]//section + +test_url: http://warriordudimanche.net/article458/589065212a599
\ No newline at end of file diff --git a/data/GrabberConfig/washingtoninstitute.org.txt b/data/GrabberConfig/washingtoninstitute.org.txt new file mode 100644 index 00000000..17f45677 --- /dev/null +++ b/data/GrabberConfig/washingtoninstitute.org.txt @@ -0,0 +1,6 @@ +body: //div[@class='main']//article + +prune: no + +test_url: http://www.washingtoninstitute.org/policy-analysis/view/striking-syria-lessons-from-the-israeli-experience?goback=.gde_3822158_member_273623672 +test_url: http://www.washingtoninstitute.org/rss/11/10
\ No newline at end of file diff --git a/data/GrabberConfig/washingtonmonthly.com.txt b/data/GrabberConfig/washingtonmonthly.com.txt new file mode 100644 index 00000000..8f8902a5 --- /dev/null +++ b/data/GrabberConfig/washingtonmonthly.com.txt @@ -0,0 +1,10 @@ +title://a[@class = 'headline-article'] + +author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ') +date://div[@class = 'article']/span[@class = 'date'] +body://div[@class = 'article'] +single_page_link://a[@class = 'print'] +strip://p[@class = 'author'] +strip://a[@class = 'headline-article'] +strip://span[@class = 'date'] +test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php
\ No newline at end of file diff --git a/data/GrabberConfig/washingtonpost.com.txt b/data/GrabberConfig/washingtonpost.com.txt new file mode 100644 index 00000000..daf1d343 --- /dev/null +++ b/data/GrabberConfig/washingtonpost.com.txt @@ -0,0 +1,42 @@ +# Seems to be redirecting to articles.washingtonpost.com for many users + +body: //div[contains(@class, "article_body")] +# print view +body: //div[@id='print_facet']//div[@id='body'] + +author://meta[@name='DC.creator']/@content +author://span[@class="pb-byline"] +author://h3[@property='dc.creator']//a[@rel='author'] +title://meta[@name='title']/@content +date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title +date://meta[@name="DC.date.issued"]/@content +date://span[contains(@class,"pb-timestamp")] +date://meta[@name="eomportal-lastUpdate"]/@content + +#GDPR cookies +http_header(Cookie): wp_devicetype=0; rplpwabt4=1; devicetype=0; osfam=0; de=; client_region=0; wp_gdpr=1|1; rplmct=1; washpost_poe=true; + +strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] +strip://div[@id="wp-column six end"] +strip://div[contains(@class,'hidden')] +strip://div[@id='article-side-rail'] +strip://div[@class="module component todays-paper-module curved"] +strip://div[@class="module component live-qa curved img-border"] +strip://div[@class="module component newsletter-signup curved"] +strip://div[@class="module featured-stories component curved img-border"] +strip://h3[@property="dc.creator"] + +strip_id_or_class: carousel +strip_id_or_class: toolbar +strip_id_or_class: module + +# Change gJQAwdJG4U_story.html to gJQAwdJG4U_print.html +single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html") +if_page_contains: //link[@rel="canonical" and contains(@href, '_story.html')] + +# [OLD] Change gJQAwdJG4U_story.html to gJQAwdJG4U_story_print.html +#single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, 
"_story.html"), "_story_print.html") + +test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 +test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html +test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html diff --git a/data/GrabberConfig/web-libre.org.txt b/data/GrabberConfig/web-libre.org.txt new file mode 100644 index 00000000..9ed43a25 --- /dev/null +++ b/data/GrabberConfig/web-libre.org.txt @@ -0,0 +1,6 @@ +body: //div[@id='template_article'] + +strip_id_or_class: article_more +strip: //hr + +test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html
\ No newline at end of file diff --git a/data/GrabberConfig/weblogs.asp.net.txt b/data/GrabberConfig/weblogs.asp.net.txt new file mode 100644 index 00000000..7cfa49d2 --- /dev/null +++ b/data/GrabberConfig/weblogs.asp.net.txt @@ -0,0 +1,9 @@ +title: //h2[@class="pageTitle"] +strip: //div[@class="postfoot"] +strip: //h2[@class="pageTitle"] +strip: //h3[@class="pageTitle"] +body: //div[@class="post"] +author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed') +date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by') + +test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx
\ No newline at end of file diff --git a/data/GrabberConfig/webmasters.googleblog.com.txt b/data/GrabberConfig/webmasters.googleblog.com.txt new file mode 100644 index 00000000..ddc0462d --- /dev/null +++ b/data/GrabberConfig/webmasters.googleblog.com.txt @@ -0,0 +1,9 @@ +body://div[@id='main'] +date://div[@class='publishdate'] +strip://div[@class='share'] +strip://div[@class='post-footer'] +strip://div[@class='cmt_iframe_holder'] +strip://div[@class='blog-pager'] +strip://div[@class='clear'] +replace_string(noscript>): div> +test_url: https://webmasters.googleblog.com/2016/08/helping-users-easily-access-content-on.html diff --git a/data/GrabberConfig/webupd8.org.txt b/data/GrabberConfig/webupd8.org.txt new file mode 100644 index 00000000..b5c165da --- /dev/null +++ b/data/GrabberConfig/webupd8.org.txt @@ -0,0 +1,12 @@ +tidy: no +prune: no + +title: //h1[@class='post-title entry-title']/a +date: //time +author: //a[@rel='fn author']/span + +body: //div[@class='post-body'] + +strip: //iframe + +test_url: http://www.webupd8.org/2015/04/organize-your-unity-launcher-based-on.html diff --git a/data/GrabberConfig/webwereld.nl.txt b/data/GrabberConfig/webwereld.nl.txt new file mode 100644 index 00000000..40a5aa36 --- /dev/null +++ b/data/GrabberConfig/webwereld.nl.txt @@ -0,0 +1,8 @@ +strip: //*[@class="paginator"] +body: //*[@id="articleText"] +next_page_link: //a[@class="next"] + +# No author detection +# No publishing date detection +# No author and intro deduplication over multiple pages +test_url: http://webwereld.nl/analyse/111452/de-code-van-dorifel-nader-bekeken.html
\ No newline at end of file diff --git a/data/GrabberConfig/welt.de.txt b/data/GrabberConfig/welt.de.txt new file mode 100644 index 00000000..42e65e97 --- /dev/null +++ b/data/GrabberConfig/welt.de.txt @@ -0,0 +1,22 @@ +# set body +tidy: no +body: //div[contains(@class, 'articleContent')] + +# remove clutter +strip: //div[@class='advertising'] +strip: //div[@class='themenalarm'] +strip: //div[contains(@class, 'inTextTeaser')] + +# remove captions +strip: //span[@class='copyRight'] + +# remove photo galleries and extras +strip: //div[contains(@class, 'textGallery')] +strip: //div[contains(@class, 'videoGallery')] +strip: //div[contains(@class, 'imageGallery')] +strip: //div[contains(@class, 'openContent')] + +# remove comments +strip: //div[@id = 'writeComment'] + +test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html
\ No newline at end of file diff --git a/data/GrabberConfig/westernadvocate.com.au.txt b/data/GrabberConfig/westernadvocate.com.au.txt new file mode 100644 index 00000000..eb00f776 --- /dev/null +++ b/data/GrabberConfig/westernadvocate.com.au.txt @@ -0,0 +1,11 @@ +title: //header[contains(@class, "news-article-title")]//h1 +date: //div[@class="news-article-byline"]//time +author: //h2[@class="news-article-author"]//cite + +# Turns out that westernadvocate is doing funky things with the slide show images. :< +# body: //ul[@class="slides"]//img | //div[contains(@class, "news-article-body")] +body: //div[contains(@class, "news-article-body")] + +strip: //div[contains(@class, "flexslider")] + +test_url: http://www.westernadvocate.com.au/story/1532050/roos-accept-ziebell-ban-commentators-do-not/ diff --git a/data/GrabberConfig/what-if.xkcd.com.txt b/data/GrabberConfig/what-if.xkcd.com.txt new file mode 100644 index 00000000..a88a02c9 --- /dev/null +++ b/data/GrabberConfig/what-if.xkcd.com.txt @@ -0,0 +1,2 @@ +autodetect_next_page: no +test_url: http://what-if.xkcd.com/1/
\ No newline at end of file diff --git a/data/GrabberConfig/whatever.scalzi.com.txt b/data/GrabberConfig/whatever.scalzi.com.txt new file mode 100644 index 00000000..100a8c88 --- /dev/null +++ b/data/GrabberConfig/whatever.scalzi.com.txt @@ -0,0 +1,7 @@ +strip: //div[@class="navigation"] +strip: //div[@id="sidebar"] +strip: //div[@id="post-extra-content"] +strip: //div[@id="footer"] +strip: //div[contains(@class, "sharing")] + +test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/
\ No newline at end of file diff --git a/data/GrabberConfig/wiki.guildwars.com.txt b/data/GrabberConfig/wiki.guildwars.com.txt new file mode 100644 index 00000000..b80fe5d1 --- /dev/null +++ b/data/GrabberConfig/wiki.guildwars.com.txt @@ -0,0 +1,8 @@ +title: //h1 +body: //div[@id='content'] +strip_id_or_class: editsection +strip_id_or_class: toc +strip: //div[@id='siteNotice'] +strip: //div[@id='content']//table[last()] +prune: no +test_url: http://wiki.guildwars.com/wiki/Monk
\ No newline at end of file diff --git a/data/GrabberConfig/wiki.guildwars2.com.txt b/data/GrabberConfig/wiki.guildwars2.com.txt new file mode 100644 index 00000000..e9233998 --- /dev/null +++ b/data/GrabberConfig/wiki.guildwars2.com.txt @@ -0,0 +1,8 @@ +title: //h1 +body: //div[@id='content'] +strip_id_or_class: editsection +strip_id_or_class: toc +strip: //div[@id='siteNotice'] +strip: //div[@id='content']//table[last()] +prune: no +test_url: http://wiki.guildwars2.com/wiki/Guardian
\ No newline at end of file diff --git a/data/GrabberConfig/wiki.obsd4a.net.txt b/data/GrabberConfig/wiki.obsd4a.net.txt new file mode 100644 index 00000000..90012069 --- /dev/null +++ b/data/GrabberConfig/wiki.obsd4a.net.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwiki.obsd4a.net%2Ftrad%3Afausses_croyances_des_developpeurs_a-propos_du_temps + +body: //div[contains(concat(' ',normalize-space(@class),' '),' page ')] +test_url: https://wiki.obsd4a.net/trad:fausses_croyances_des_developpeurs_a-propos_du_temps
\ No newline at end of file diff --git a/data/GrabberConfig/wikihow.com.txt b/data/GrabberConfig/wikihow.com.txt new file mode 100644 index 00000000..a19c7176 --- /dev/null +++ b/data/GrabberConfig/wikihow.com.txt @@ -0,0 +1,16 @@ +# ...&printable=yes +body: //div[@id='bodycontents'] +prune: no +tidy: no +strip_id_or_class: gatEditSection +strip_id_or_class: relatedwikihows +strip_id_or_class: editsection +#strip: //div[contains(@class, 'step_num')] + +#replace_string(<script ): <div style="display: none" +#replace_string(</script>): </div> + +single_page_link: //a[@id='gatPrintView'] +single_page_link: concat(//link[@rel='canonical']/@href, '?printable=yes') + +test_url: http://www.wikihow.com/Start-Your-Own-Country diff --git a/data/GrabberConfig/wikitravel.org.txt b/data/GrabberConfig/wikitravel.org.txt new file mode 100644 index 00000000..1f32a372 --- /dev/null +++ b/data/GrabberConfig/wikitravel.org.txt @@ -0,0 +1,14 @@ +# copied from .wikipedia.org.txt +title: //h1[@id='firstHeading' or @class='firstHeading'] +body: //div[@id = 'bodyContent'] +strip_id_or_class: editsection +#strip_id_or_class: toc +strip_id_or_class: vertical-navbox +strip: //table[@id='toc'] | //div[@id='p-toc'] +strip: //div[@id='catlinks' or @id='contentSub'] +strip: //div[@id='jump-to-nav'] +strip: //div[@class='thumbcaption']//div[@class='magnify'] +strip: //table[@class='navbox'] +prune: no +tidy: no +test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes
\ No newline at end of file diff --git a/data/GrabberConfig/will-self.com.txt b/data/GrabberConfig/will-self.com.txt new file mode 100644 index 00000000..394f9ca4 --- /dev/null +++ b/data/GrabberConfig/will-self.com.txt @@ -0,0 +1,4 @@ +strip: //div[@class="widget-area"] +title: //*[@class="entry-title"] +date: //time[@class="entry-date"] +test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/
\ No newline at end of file diff --git a/data/GrabberConfig/winfuture.de.txt b/data/GrabberConfig/winfuture.de.txt new file mode 100644 index 00000000..dddc6f9e --- /dev/null +++ b/data/GrabberConfig/winfuture.de.txt @@ -0,0 +1,12 @@ +title: //h1/span + +body: //div[@id="news_content"] + +author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text() + +date: //span[@class='date'] + +# Rubrikenbild entfernen +strip: //div[@id="news_content"]/a[1] + +test_url: http://winfuture.de/news,69672.html
\ No newline at end of file diff --git a/data/GrabberConfig/wired.com.txt b/data/GrabberConfig/wired.com.txt new file mode 100644 index 00000000..4342e82e --- /dev/null +++ b/data/GrabberConfig/wired.com.txt @@ -0,0 +1,35 @@ +title: //meta[@property="og:title"]/@content +author: //meta[@name='Author']/@content +date: //meta[@name='DisplayDate']/@content +body: (//article[contains(@class, 'body-copy')])[1] +body: (//article[contains(@class, 'article-body-component')])[1] +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')] +strip: //p[contains(., 'Pages:') and contains(., 'View All')] +strip: //p[@class='caption'] +strip: //div[@class='desc' or @class='slide' or @id='slide-info'] + +strip_id_or_class: pullquote +strip_id_or_class: left_rail +strip_id_or_class: related-container +strip_id_or_class: radvert-caption-wrap +strip_id_or_class: related +strip_id_or_class: article-tags +strip_id_or_class: visually-hidden +strip_id_or_class: inset-left-component + +# Remove gallery? 
+strip_id_or_class: wpgallery + +#strip: //text()[contains(., 'nextpage')] + +prune: no + +# 2017-07-27 No longer used it seems +# single_page_link: //a[.='View All' and contains(@href, '/all/')] + +test_url: https://www.wired.com/2017/07/inside-cubas-diy-internet-revolution/ +test_contains: I should call the hostess once I know +test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ +test_url: http://www.wired.com/wiredenterprise/2013/09/docker/ +test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/ +test_url: http://www.wired.com/2015/11/i-turned-off-javascript-for-a-whole-week-and-it-was-glorious/ diff --git a/data/GrabberConfig/wired.de.txt b/data/GrabberConfig/wired.de.txt new file mode 100644 index 00000000..74293c06 --- /dev/null +++ b/data/GrabberConfig/wired.de.txt @@ -0,0 +1,7 @@ +title: //meta[@name='Title']/@content +author: //meta[@name='Author']/@content +date: //meta[@name='DisplayDate']/@content +body: //div[@class='article-content'] +strip: //div[@class='article-sidebar'] + +test_url: https://www.wired.de/collection/latest/der-vw-skandal-zeigt-ohne-vertrauen-ist-software-nicht-mehr-als-bose-magie diff --git a/data/GrabberConfig/wmpoweruser.com.txt b/data/GrabberConfig/wmpoweruser.com.txt new file mode 100644 index 00000000..70168fbe --- /dev/null +++ b/data/GrabberConfig/wmpoweruser.com.txt @@ -0,0 +1,4 @@ +date://*[@class="entry-date"] +author://*[@class="author vcard"] +strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"] +test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/
\ No newline at end of file diff --git a/data/GrabberConfig/wn.de.txt b/data/GrabberConfig/wn.de.txt new file mode 100644 index 00000000..ef18c8a5 --- /dev/null +++ b/data/GrabberConfig/wn.de.txt @@ -0,0 +1,18 @@ +author: //div[@id='main']//div[@class='col right']//div[contains(@class, 'attribute-author')] +body: //div[@id='main']//div[@class='col right'] +strip_id_or_class: boxes +strip_id_or_class: lazy +strip_id_or_class: comment_box +strip_id_or_class: fb_comments + +find_string: <noscript> +replace_string: <div> +find_string: </noscript> +replace_string: </div> + +prune: no +tidy: no + +test_url: http://www.wn.de/Muenster/Kultur/1742956-Wilm-Weppelmann-verlaesst-die-Einsiedelei-Und-dann-ab-unter-die-Dusche +# feed +test_url: http://www.wn.de/rss/feed/wn_muenster
\ No newline at end of file diff --git a/data/GrabberConfig/wordyard.com.txt b/data/GrabberConfig/wordyard.com.txt new file mode 100644 index 00000000..d8c753da --- /dev/null +++ b/data/GrabberConfig/wordyard.com.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.wordyard.com%2F2014%2F09%2F26%2Fremove-blindfold-before-embarking-to-utopia%2F + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +strip_id_or_class: robots-nocontent +strip_id_or_class: post-revisions +test_url: http://www.wordyard.com/2014/09/26/remove-blindfold-before-embarking-to-utopia/
\ No newline at end of file diff --git a/data/GrabberConfig/worldcrunch.com.txt b/data/GrabberConfig/worldcrunch.com.txt new file mode 100644 index 00000000..e4024159 --- /dev/null +++ b/data/GrabberConfig/worldcrunch.com.txt @@ -0,0 +1,12 @@ + +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-subtitle ')] | //div[contains(concat(' ',normalize-space(@class),' '),' article-text ')] + +author: //meta[@property="article:author"]/@content | //a[contains(concat(' ',normalize-space(@class),' '),' original-author ')] + +date: //meta[@property="article:published"]/@content + +prune: no + +strip: //p/em[normalize-space(text())='See more from']/a[starts-with(@href,'https://www.worldcrunch.com/')]/parent::em + +test_url: https://www.worldcrunch.com/business-finance/why-philosophers-are-hot-profiles-on-corporate-job-market diff --git a/data/GrabberConfig/worldpoultry.net.txt b/data/GrabberConfig/worldpoultry.net.txt new file mode 100644 index 00000000..b88f9279 --- /dev/null +++ b/data/GrabberConfig/worldpoultry.net.txt @@ -0,0 +1,5 @@ +title: //div[@class="content article"]/h1 +date: substring-after(//*[@class='date'], '//') +body: //*[@class='article-content'] +strip: //*[@id='nomodal'] +test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html
\ No newline at end of file diff --git a/data/GrabberConfig/worldwidewords.org.txt b/data/GrabberConfig/worldwidewords.org.txt new file mode 100644 index 00000000..4682e0d3 --- /dev/null +++ b/data/GrabberConfig/worldwidewords.org.txt @@ -0,0 +1,4 @@ +title: //p[@id='content'] + +body: //div[@class='contentblock'] +test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm
\ No newline at end of file diff --git a/data/GrabberConfig/wow.joystiq.com.txt b/data/GrabberConfig/wow.joystiq.com.txt new file mode 100644 index 00000000..44add9c9 --- /dev/null +++ b/data/GrabberConfig/wow.joystiq.com.txt @@ -0,0 +1,6 @@ +title: //h2[@class="posttitle"] +body: //div[@class="post"] +strip: //h2[@class="posttitle"] +strip: //p[@class="filed-under"] +convert_double_br_tags: yes +test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/
\ No newline at end of file diff --git a/data/GrabberConfig/wpmayor.com.txt b/data/GrabberConfig/wpmayor.com.txt new file mode 100644 index 00000000..bb4fffc7 --- /dev/null +++ b/data/GrabberConfig/wpmayor.com.txt @@ -0,0 +1,8 @@ +body: //div[@id='nrelate_flyout_placeholder'] + +strip_id_or_class: share + +prune: no + +test_url: http://www.wpmayor.com/themes/wordpress-portfolio-resume-themes/ +test_url: http://www.wpmayor.com/feed/
\ No newline at end of file diff --git a/data/GrabberConfig/wsj.com.txt b/data/GrabberConfig/wsj.com.txt new file mode 100644 index 00000000..08d799f0 --- /dev/null +++ b/data/GrabberConfig/wsj.com.txt @@ -0,0 +1,33 @@ +title: //meta[@property="og:title"]/@content +body: //div[@id='wsj-article-wrap'] +# is this still used? +body: //div[@id='article_story_body'] + +author: //meta[@name="author"]/@content +# for slide show content +body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1] +date: //meta[@itemprop="dateCreated"]/@content + +strip_id_or_class: insetFullBracket +strip_id_or_class: insettipBox +#strip_id_or_class: legacyInset +strip_id_or_class: recipeACShopAndBuyText + +strip: //div[contains(@class, 'insetContent')]//cite +strip: //*[contains(@style, 'visibility: hidden;')] +strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))] +strip: //div[contains(@class, 'carousel')] + +# see https://elaineou.com/2017/01/19/how-the-twitter-app-bypasses-paywalls/ +http_header(user-agent): Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.1.32 (KHTML, like Gecko) Mobile/14C92 Twitter for iPhone +http_header(referer): https://t.co/T1323aaaa + +prune: no +tidy: no + +test_url: http://www.wsj.com/articles/airasia-flight-8501-tail-recovered-1420878809 +test_contains: Saturday evening that the black boxes +test_url: http://www.wsj.com/news/articles/SB10001424052702304626304579509100018004342 +test_url: http://www.wsj.com/article/SB10001424052970203363504577185322849515102.html +# slide show +test_url: http://www.wsj.com/article/SB10001424052970204791104577110550376458164.html diff --git a/data/GrabberConfig/wtatennis.com.txt b/data/GrabberConfig/wtatennis.com.txt new file mode 100644 index 00000000..1000ab26 --- /dev/null +++ b/data/GrabberConfig/wtatennis.com.txt @@ -0,0 +1,7 @@ +title: //h1[contains(@class, 'header-2')] +body: //article//*[contains(@class, 'teaserText') or 
contains(@class, 'lastUpdated') or contains(@class, 'image') or contains(@class, 'body')] +strip_id_or_class: articleIndex +prune: no + +test_url: http://www.wtatennis.com/news/article/3190914 +test_url: http://www.wtatennis.com/news/article/3190244
\ No newline at end of file diff --git a/data/GrabberConfig/www1.folha.uol.com.br.txt b/data/GrabberConfig/www1.folha.uol.com.br.txt new file mode 100644 index 00000000..97a5c19d --- /dev/null +++ b/data/GrabberConfig/www1.folha.uol.com.br.txt @@ -0,0 +1,15 @@ +body://div[@id='articleNew'] +strip://div[@id='articleBy'] +strip://div[@id='articleDate'] +strip://td[@class='articleGraphicCredit'] +strip://h1 +strip://div[@id='articleEnd'] +strip://p[@class='tagline'] +strip://div[@class='openBox adslibraryArticle'] +strip_id_or_class:ad-180x150-1 + + +title: //div[@id="articleNew"]/h1 +author: //div[@id="articleBy"]/p/b +date: substring-before(//div[@id="articleDate"], "-") +test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml
\ No newline at end of file diff --git a/data/GrabberConfig/www2.cnrs.fr.txt b/data/GrabberConfig/www2.cnrs.fr.txt new file mode 100644 index 00000000..313f6943 --- /dev/null +++ b/data/GrabberConfig/www2.cnrs.fr.txt @@ -0,0 +1,5 @@ +# Site configuration for CNRS press releases + +body: //div[@id="contenu"]//h2[@id="chapeau"] | //div[@id="contenu"]/div[@id="textContenu"] + +test_url: http://www2.cnrs.fr/presse/communique/5327.htm diff --git a/data/GrabberConfig/www3.imperial.ac.uk.txt b/data/GrabberConfig/www3.imperial.ac.uk.txt new file mode 100644 index 00000000..71306af2 --- /dev/null +++ b/data/GrabberConfig/www3.imperial.ac.uk.txt @@ -0,0 +1,2 @@ +strip_id_or_class: hidelabel +test_url: http://www3.imperial.ac.uk/newsandeventspggrp/imperialcollege/newssummary/news_14-7-2010-15-53-18
\ No newline at end of file diff --git a/data/GrabberConfig/wyborcza.pl.txt b/data/GrabberConfig/wyborcza.pl.txt new file mode 100644 index 00000000..638583dc --- /dev/null +++ b/data/GrabberConfig/wyborcza.pl.txt @@ -0,0 +1,9 @@ +body: //div[@id='article'] +strip: //div[@class='head'] + +strip_id_or_class: txt_upl + +single_page_link: //div[@id='gazeta_article_tools']//a[contains(@class, 'print')] + +test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x +test_url: http://wyborcza.pl/1,75478,14880255,Biskup_Dydycz_o_pedofilii_i_tajemnicy_spowiedzi__Zamiast.html
\ No newline at end of file diff --git a/data/GrabberConfig/wz-newsline.de.txt b/data/GrabberConfig/wz-newsline.de.txt new file mode 100644 index 00000000..5b2be744 --- /dev/null +++ b/data/GrabberConfig/wz-newsline.de.txt @@ -0,0 +1,5 @@ +title://h1 + +date://p[@class='articleDate'] +body://div[@class='articleBody wzStandardArticle'] +test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050
\ No newline at end of file diff --git a/data/GrabberConfig/xatakaciencia.com.txt b/data/GrabberConfig/xatakaciencia.com.txt new file mode 100644 index 00000000..fbe2f8b0 --- /dev/null +++ b/data/GrabberConfig/xatakaciencia.com.txt @@ -0,0 +1,6 @@ +title: //h1//span +body: //div[contains(concat(' ',normalize-space(@class),' '),' article-content ')] + +replace_string(sf-src): src + +test_url: https://www.xatakaciencia.com/salud/esta-es-la-imagen-de-las-calorias-recomendadas-que-debes-consumir-en-un-fast-food diff --git a/data/GrabberConfig/xlsemanal.com.txt b/data/GrabberConfig/xlsemanal.com.txt new file mode 100644 index 00000000..610d03fb --- /dev/null +++ b/data/GrabberConfig/xlsemanal.com.txt @@ -0,0 +1,3 @@ +body: //section[contains(concat(' ',normalize-space(@class),' '),' post-content ')] + +test_url: http://www.xlsemanal.com/firmas/20171126/perez-reverte-el-hombre-que-si-estaba-alli.html diff --git a/data/GrabberConfig/xn--protin-bva.com.txt b/data/GrabberConfig/xn--protin-bva.com.txt new file mode 100644 index 00000000..0e4baa26 --- /dev/null +++ b/data/GrabberConfig/xn--protin-bva.com.txt @@ -0,0 +1,8 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=xn--protin-bva.com%2Fpost%2F2009%2F06%2F09%2FSAV-Free-un-sketch-kafkaien + +# pérotin.com.txt + +body: //div[contains(concat(' ',normalize-space(@class),' '),' post ')] +test_url: http://xn--protin-bva.com/post/2009/06/09/SAV-Free-un-sketch-kafkaien diff --git a/data/GrabberConfig/ynet.co.il.txt b/data/GrabberConfig/ynet.co.il.txt new file mode 100644 index 00000000..aa86566a --- /dev/null +++ b/data/GrabberConfig/ynet.co.il.txt @@ -0,0 +1,26 @@ +body: //span[@id='article_content' or @class='text16g'] + +# ads +strip: //div[.//div[contains(@id, 'ads.')]] +# related content heading +strip: //p[contains(., 'עוד בערוץ החדשות של ynet:')] +strip: //p[contains(., 'כותרות אחרונות מהעולם 
בחדשות ynet:')] +strip: //div[contains(., 'אינציקלופדיית ynet:')] +# related content links +strip: //a[@class='bluelink'] +# strip image bullets +strip_image_src: ynet_manual_bullet.png + +prune: no +tidy: no + +# prevent JS issues +find_string: <script type='text/javascript'> +replace_string: <div style="display:none;"> +find_string: </script> +replace_string: </div> + +test_url: http://www.ynet.co.il/articles/0,7340,L-4354266,00.html +test_url: http://www.ynet.co.il/articles/0,7340,L-4354268,00.html +#feed +test_url: http://www.ynet.co.il/Integration/StoryRss2.xml
\ No newline at end of file diff --git a/data/GrabberConfig/yostivanich.com.txt b/data/GrabberConfig/yostivanich.com.txt new file mode 100644 index 00000000..2aeb7e05 --- /dev/null +++ b/data/GrabberConfig/yostivanich.com.txt @@ -0,0 +1,5 @@ +title://div[@class='entry-title'] +body://div[@class='entry-content'] +strip_comments:yes +convert_double_br_tags:yes +test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/
\ No newline at end of file diff --git a/data/GrabberConfig/yourerie.com.txt b/data/GrabberConfig/yourerie.com.txt new file mode 100644 index 00000000..46ee5ba1 --- /dev/null +++ b/data/GrabberConfig/yourerie.com.txt @@ -0,0 +1,2 @@ +body: //div[@itemprop="articleBody"] +test_url: http://www.yourerie.com/news/news-article/d/story/cd-release-party-at-pi-downs/22898/G_gFL3mSQkWH_DW2wLuMOA diff --git a/data/GrabberConfig/youtube.com.txt b/data/GrabberConfig/youtube.com.txt new file mode 100644 index 00000000..b0d95f1f --- /dev/null +++ b/data/GrabberConfig/youtube.com.txt @@ -0,0 +1,15 @@ +title: //title +body: //iframe + +find_string: <html><iframe +replace_string: <iframe id="video" + +find_string: ></iframe></html> +replace_string: ></iframe> + +single_page_link: //link[@type='text/xml+oembed'] + +prune: no +tidy: no + +test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU
\ No newline at end of file diff --git a/data/GrabberConfig/zdnet.com.txt b/data/GrabberConfig/zdnet.com.txt new file mode 100644 index 00000000..939fb0e3 --- /dev/null +++ b/data/GrabberConfig/zdnet.com.txt @@ -0,0 +1,10 @@ +title: //h1[@class="h s-1"] +author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|') +author: substring-after(//div[@class="bio"]//h3, 'About ') +date: substring-after(//p[@class="meta s-10"], '|') +date: substring-after(//p[@class="meta"], '|') +body: //div[@class="content-1 entry space-1 clear"] +body: //div[@class="storyBody"] + +test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920 +test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/
\ No newline at end of file diff --git a/data/GrabberConfig/ze.tt.txt b/data/GrabberConfig/ze.tt.txt new file mode 100644 index 00000000..60ffee52 --- /dev/null +++ b/data/GrabberConfig/ze.tt.txt @@ -0,0 +1,16 @@ +tidy: yes +prune: yes +autodetect_on_failure: no + +#Remove ads +strip: //div[contains(@class, 'ph-ad')] +strip: //p//a[@target='_blank']//em + +title: //h1[contains(@class, 'ph-article-title')] +date: //time[contains(@class, 'ph-article-date')] +author: //div[contains(@class, 'ph-article-authors-wrapper')] +body: //div[contains(@class, 'ph-article-text-lead')] | //div[contains(@class, 'ph-article-text-body')] | //div[contains(@class, 'ph-article-image-wrapper')] + +test_url: http://ze.tt/fotograf-portraetiert-menschen-nach-12-und-3-glaesern-wein/ +test_url: http://ze.tt/fotoprojekt-menschen-in-mosambik-zeigen-selbst-wie-sie-der-duerre-trotzen/ +test_url: http://ze.tt/was-deine-tattoos-ueber-dich-aussagen/ diff --git a/data/GrabberConfig/zeit.de.txt b/data/GrabberConfig/zeit.de.txt new file mode 100644 index 00000000..33d108c5 --- /dev/null +++ b/data/GrabberConfig/zeit.de.txt @@ -0,0 +1,64 @@ +tidy: no +prune: no + +# Figures are wrapped in a noscript tag which is itself wrapped in a +# conditional comment. Feed readers will fail to parse this correctly +# so get rid of the noscript tag altogether. 
+replace_string(<noscript): <foo + +title: //meta[@property='og:title']/@content +date: //meta[@name='date']/@content + +strip: //span[@class='figure__copyright'] + +# Self advertisements +strip: //figure[@class='figure-stamp'] +strip: //a[contains(@title, 'Dieser Text ')] +strip: //a[contains(@title, 'Dieser Artikel ')] +strip: //span[@class='figure__text']/text()[contains(., 'Dieser Text ')] +strip: //span[@class='figure__text']/text()[contains(., 'Dieser Artikel ')] + +####################################### +# ZEIT: +####################################### + +single_page_link: //a[@class='article-toc__onesie'] + +author: //a[@class='byline__author']/span +author: substring-after(//span[@class='metadata__source'], 'Quelle: ') + +body: //main/article/div[@itemprop='articleBody'] + +strip: //a[@href='#'] +strip: //form[@id='newsletter-teaser-form'] +strip_id_or_class: 'article-pagination article__item' + +test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag +test_contains: In drei Minuten die Welt erobern +test_url: http://www.zeit.de/sport/2016-01/darts-wm-finale-anderson-lewis/komplettansicht +test_url: http://www.zeit.de/kultur/2015-12/selbstverwirklichung-optimierung-essay +test_url: http://www.zeit.de/2015/51/selbstdiagnose-gesundheit-zuhause-blut-urin-selbsttest +test_url: http://www.zeit.de/campus/2015/s2/nebenjob-master-studium-finanzierung +test_url: http://www.zeit.de/hamburg/politik-wirtschaft/2015-12/hamburg-wohnungsbau-fluechtlinge + +####################################### +# ZEIT MAGAZIN: +####################################### + +next_page_link: //meta[@rel='next']/@href + +author: //a[@class='article__head__meta__author']/span + +body: //main/article/div[@class='article__wrap'] + +strip: //div[@class='article__head-wrap']//h1 +strip_id_or_class: article__socialbox +strip_id_or_class: article__nextread nextread-base is-centered +strip_id_or_class: js-comments +strip_id_or_class: article__pagination is-constrained is-centered 
+strip_id_or_class: article__head__meta + +test_url: http://www.zeit.de/zeit-magazin/2015/51/daisy-ridley-traum +test_contains: Mein ganzes Leben lang habe +test_url: http://www.zeit.de/zeit-magazin/2015/52/cyberstalking-internet-stalker-familie-mierau +test_url: http://www.zeit.de/zeit-magazin/2017/46/harald-martenstein-ddr-toastbrot diff --git a/data/GrabberConfig/zenzla.com.txt b/data/GrabberConfig/zenzla.com.txt new file mode 100644 index 00000000..d3718792 --- /dev/null +++ b/data/GrabberConfig/zenzla.com.txt @@ -0,0 +1,6 @@ +# Generated by FiveFilters.org's web-based selection tool +# Place this file inside your site_config/custom/ folder +# Source: http://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.zenzla.com%2Flinux%2F857-sommes-agents-nsa.html + +body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')] +test_url: https://www.zenzla.com/linux/857-sommes-agents-nsa.html
\ No newline at end of file diff --git a/data/GrabberConfig/zerohedge.com.txt b/data/GrabberConfig/zerohedge.com.txt new file mode 100644 index 00000000..7e76aee5 --- /dev/null +++ b/data/GrabberConfig/zerohedge.com.txt @@ -0,0 +1,10 @@ +author: //span[@class='submitted']/a +strip: //div[@class='clear-block clr'] +strip: //div[@class='picture'] +strip: //span[@class='submitted'] +strip: //div[@class='breadcrumb'] +strip: //div[@class='fivestar-static-form-item'] +strip: //div[@class='js-links'] +strip: //div[@class='links clear-block clear'] +strip: //div[@class='block block-block'] +test_url: http://www.zerohedge.com/news/bernankes-columbus-voyage-end-monetary-policy-world
\ No newline at end of file diff --git a/data/GrabberConfig/zerokspot.com.txt b/data/GrabberConfig/zerokspot.com.txt new file mode 100644 index 00000000..afa964db --- /dev/null +++ b/data/GrabberConfig/zerokspot.com.txt @@ -0,0 +1,3 @@ +title: //h1 +body: //div[@id="primarycontent"] +test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/
\ No newline at end of file diff --git a/data/GrabberConfig/zhihu.com.txt b/data/GrabberConfig/zhihu.com.txt new file mode 100644 index 00000000..3c9d8c1a --- /dev/null +++ b/data/GrabberConfig/zhihu.com.txt @@ -0,0 +1,19 @@ +# This filter is tested on: +# http://www.zhihu.com/question/19587406 +# http://www.zhihu.com/question/20649035 +# http://www.zhihu.com/question/20637942 + +author: //h3[@class='zm-item-answer-author-wrap'] +title://h2[@class='zm-item-title'] +date://a[@class='answer-date-link meta-item'] +convert_double_br_tags: yes + +wrap_in(blockquote)://div[@class='zm-editable-content'] +wrap_in(blockquote)://sup/text() +dissolve://sup + +strip://div[@class='zh-answers-title'] +strip://div[@class='zm-item-vote-info '] +strip://div[@class='zm-item-answer-author-info'] +strip://div[@class='zu-blue-info-board zg-r3px'] +test_url: http://www.zhihu.com/question/20637942
\ No newline at end of file diff --git a/data/GrabberConfig/zive.cz.txt b/data/GrabberConfig/zive.cz.txt new file mode 100644 index 00000000..deef62d5 --- /dev/null +++ b/data/GrabberConfig/zive.cz.txt @@ -0,0 +1,9 @@ +body: //*[@class='ar-annotation'] | //div[contains(@class, 'ar-content')] +strip_id_or_class: ar-link-to-another +strip_id_or_class: ar-tags +find_string: Mohlo by vás zajímat: +replace_string: <!-- removed --> +next_page_link: //a[@data-tracker='Navigace,NextChapter'] + +test_url: http://www.zive.cz/clanky/tyden-zive-pocitac-ktery-pripomina-hamburger/sc-3-a-180652/default.aspx +test_url: http://www.zive.cz/clanky/6-specialnich-pc-na-doma-k-praci-i-na-hrani-vanoce-2015/sc-3-a-180617/default.aspx diff --git a/data/GrabberConfig/zoomit.ir.txt b/data/GrabberConfig/zoomit.ir.txt new file mode 100644 index 00000000..05af60ed --- /dev/null +++ b/data/GrabberConfig/zoomit.ir.txt @@ -0,0 +1,3 @@ +body: //div[@itemprop="image" or @itemprop="description" or @itemprop="articleBody"] + +test_url: http://www.zoomit.ir/2017/2/25/152960/tiny-engine-turns-natural-gas-into-hydrogen/ |