#!/usr/bin/perl
#
# Blogs.pm:
# List of blogs and how to treat links coming off them.
#
# Copyright (c) 2002 Chris Lightfoot. All rights reserved.
# Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
#
# $Id: Blogs.pm,v 1.14 2003/03/24 23:48:26 chris Exp $
#

package BIMBO::Blogs;

use strict;
use warnings;

use URI::URL;

# %sites
# Table of known sites, keyed by a short identifier. Each value is a
# three-element array reference:
#
#   [0]  URL of the site's front page;
#   [1]  short human-readable name of the site;
#   [2]  code reference which is passed a single link URL (a string) and
#        returns true if the link should be accepted, false if it should
#        be rejected.
#
# Filters which only care about the host part of a link parse it with
# URI::URL's url() and match the host case-insensitively; the others match
# directly against the full URL string.
our %sites = (
    slashdot => [
        # URL of site
        "http://slashdot.org/",
        # Short name of site
        "Slashdot",
        # Subroutine to accept/reject URLs.
        sub {
            my ($link) = @_;
            my $host = url($link)->host;
            # Reject Slashdot itself, its sister sites and advertisers.
            return ($host !~ m#(animefu\.com|penny-arcade\.com|bigempire\.com|themes\.org|thinkgeek\.com|oldmanmurray\.|dice\.com|freshmeat\.net|slashdot\.org|osdn\.|cmdrtaco\.|sourceforge\.net|everything2\.com|doubleclick\.net)#i);
        }
    ],

    memepool => [
        "http://memepool.com/",
        "Memepool",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ m#memepool\.com#i);
        }
    ],

    charliestross => [
        "http://www.antipope.org/charlie/blosxom.cgi",
        "Charlie Stross's web log",
        sub {
            my ($link) = @_;
            my $host = url($link)->host;
            return ($link !~ m#antipope\.org/charlie/blosxom\.cgi#
                    and $host !~ m#quicktopic\.com#i
                    # I'm sure the book is great but we don't want to buy
                    # it from here.
                    and $host !~ m#amazon\.(co\.uk|com)#i
                    # Who wants to read pages of Google search results?
                    and $host !~ m#www\.google\.(co\.uk|com)#i);
        }
    ],

    megpickard => [
        "http://www.notsosoft.com/blog/",
        "Meg Pickard's web log",
        sub {
            my ($link) = @_;
            # Only external links.
            return ($link !~ m#^http://.*notsosoft\.com/#);
        }
    ],

    bbcnews => [
        "http://news.bbc.co.uk/2/low.html",
        "BBC World News",
        sub {
            my ($link) = @_;
            # Only news stories within the BBC. The dots are escaped so
            # that they match only a literal `.'.
            return ($link =~ m#^http://news\.bbc\.co\.uk/2/low/[^/]+/\d+\.stm#);
        }
    ],

    b3ta => [
        "http://www.b3ta.com/",
        "B Three T A",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ m#b3ta\.com#i);
        }
    ],

    meghourihan => [
        "http://www.megnut.com/",
        "Meg Hourihan's web log",
        sub {
            my ($link) = @_;
            my $host = url($link)->host;
            return ($host !~ m#megnut\.com#i
                    and $host !~ /amazon/i);    # sod buying books
        }
    ],

    salon => [
        "http://www.salon.com/",
        "Salon magazine",
        sub {
            my ($link) = @_;
            # Only want Salon stories, which have a date in the URL.
            # Looks like `Tom Tomorrow' is premium content, which sucks.
            return ($link =~ m#http://www\.salon\.com/.*/\d\d\d\d/\d\d/\d\d#
                    and $link !~ m#http://www\.salon\.com/comics/tomo/#
                    # don't want to grab both printable and non-printable
                    # version
                    and $link !~ /print\.html$/
                    # ditch premium content
                    and $link !~ /index_np\.html$/);
        }
    ],

    straightdope => [
        "http://www.straightdope.com/",
        "The Straight Dope",
        sub {
            my ($link) = @_;
            # Only mailbag, classics and columns pages.
            return ($link =~ m#http://www\.straightdope\.com/(mailbag|classics|columns)/.*\.html#);
        }
    ],

    slate => [
        "http://slate.msn.com/",
        "Microsoft Slate",
        sub {
            my ($link) = @_;
            # Only stories within Slate (the URL format has changed...)
            return ($link =~ m#^http://slate\.msn\.com\/id\/\d+\/$#);
        }
    ],

    onlineblog => [
        "http://www.onlineblog.com/",
        "Guardian Online web log",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ /onlineblog\.com/i);
        }
    ],

    scottrosenberg => [
        "http://blogs.salon.com/0000014/",
        "Scott Rosenberg's Links and Comment",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ /salon\.com$/i);
        }
    ],

    usrbingirl => [
        "http://www.stormwerks.com/linked/",
        "/usr/bin/girl",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ /stormwerks\.com$/i);
        }
    ],

    newscientistnews => [
        "http://www.newscientist.com/news/",
        "New Scientist news",
        sub {
            my ($link) = @_;
            # Only New Scientist news stories.
            return ($link =~ m#newscientist\.com/news/news\.jsp\?id=ns\d+#);
        }
    ],

    wired => [
        "http://www.wired.com/",
        "Wired",
        sub {
            my ($link) = @_;
            my $host = url($link)->host;
            # Wired stories and external links.
            return ($link =~ m#^http://.*\.hotwired\.com/r/wn_html_link/http://www\.wired\.com/news/#
                    or $host !~ /(doubleclick|wired|lycos)\.(com|net)/i
                    or $link =~ m#^http://www\.wired\.com/news/.*\.html$#);
        }
    ],

    rebecca => [
        "http://www.rebeccablood.net/",
        "What's in Rebecca's pocket?",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ /rebeccablood\.net$/i);
        }
    ],

    googlenews => [
        "http://news.google.com/",
        "Google News",
        sub {
            my ($link) = @_;
            # Only links whose host is not under google.com.
            return (url($link)->host !~ /google\.com$/i);
        }
    ],

    atlanticcurrent => [
        "http://www.theatlantic.com/issues/current/",
        "The Atlantic Online (current issue)",
        sub {
            my ($link) = @_;
            # Only articles under /issues/<digits>/<digits>/.
            return $link =~ m#^http://www\.theatlantic\.com/issues/\d+/\d+/.*\.htm$#;
        }
    ],

    atlantic => [
        "http://www.theatlantic.com/",
        "The Atlantic Online",
        sub {
            my ($link) = @_;
            # Only .htm pages under /unbound/ or /issues/.
            return $link =~ m#^http://www\.theatlantic\.com/(unbound|issues)/.*\.htm$#;
        }
    ],

    oblomovka => [
        "http://www.oblomovka.com/",
        "Danny O'Brien's web log",
        sub {
            my ($link) = @_;
            # Only external links.
            return (url($link)->host !~ /oblomovka\.com$/i);
        }
    ],

    # OK. Blogging is a stupid popularity contest. So let's ``leverage''
    # MIT's efforts to ``measure'' it.
    blogdex => [
        "http://blogdex.media.mit.edu/",
        "MIT Blogdex",
        sub {
            my ($link) = @_;
            # only want links off blogdex....
            return ($link !~ m#^http://blogdex\.media\.mit\.edu/#i
                    and $link !~ m#^http://(www\.)?blogdex\.net/#i);
        }
    ],

    metafilter => [
        "http://www.metafilter.com/",
        "MetaFilter",
        sub {
            my ($link) = @_;
            # links off metafilter
            return ($link !~ m#^http://[^./]+\.metafilter\.com/#i);
        }
    ],
);

1;