#!/usr/bin/perl
#
# All the cool kids are doing RSS. ME TOO!
#
use XML::RSS;
use XML::Atom::Feed;
use LWP::UserAgent;
use Date::Parse;
use POSIX;
use Digest::MD5 qw( md5_hex );
use Data::Dumper;
use strict;
use HTML::Entities;
use Unicode::String qw( utf8 );
use DBI;
use DBD::mysql;
use Storable qw( freeze );
use Encode qw( encode_utf8 decode_utf8 );
use HTML::TokeParser;
use Getopt::Long;
use Log::Log4perl qw(:easy);

BEGIN {
    # web user will not have HOME set, so be cheeky and assume it's my
    # own homedir that's required.
    if ( !( $ENV{HOME}||"" )) {
        # getpwnam returns the whole passwd entry; only $dir (the home
        # directory, field 8) is actually used.
        my ( $name, $passwd, $uid, $gid, $quota, $comment, $gcos, $dir, $shell,
             $expire ) = getpwnam( "localwaider" );
        $ENV{HOME} = $dir;
    }
}
use lib "$ENV{HOME}/src/perl";
use WaiderDotIe qw( getconfig );

# set up python-like logging
Log::Log4perl->easy_init( { level => $INFO,
                            layout => '%p:root:%m%n' });

# IWBNI:
# * Users, with passwords and preferences and what not.
# * Clean up cache dir/database if a feed is removed
# * Clean out expired items

my $conf = getconfig();

# connect to the database
my $dbh = DBI->connect(
    "DBI:mysql:database=$conf->{database};host=$conf->{host};port=$conf->{port}" . ( $conf->{ssl} ? ';mysql_ssl=1' : '' ),
    $conf->{dbuser},
    $conf->{dbpass},
    {
        RaiseError => 1,
        ShowErrorStatement => 1,
#              HandleError => sub {
#                  my ( $err, $handle, $retval ) = @_;

#              },
        AutoCommit => 1,
    }
    ) or die $DBI::errstr;

# in case of flakiness
$dbh->{mysql_auto_reconnect} = 1;

# get the list of feeds + urls
# each row is [ name, url, feedid, error ]
my @urls;
my $feeds = $dbh->selectall_arrayref( 'SELECT name, url, feedid, error FROM feeds' )
    or die $DBI::errstr;
@urls = @{$feeds};

# command-line switches; the boolean ones are negatable (--nodebug etc.)
my $debug = 0;   # --debug: verbose logging plus .preparse/.parsed/.new dump files
my $reparse = 0; # --reparse: re-run the parse even when the fetch is skipped/304
my $refresh = 0; # --refresh: discard cached feed state and refetch from scratch
my $network = 1; # --nonetwork: skip the HTTP fetch entirely

GetOptions( 'debug!' => \$debug,
            'url=s' => sub {
                # --url: restrict the run to a single feed, inserting
                # it into the feeds table first if it's not known yet.
                # NOTE(review): the synthesised row has no 4th (error)
                # element, so $lasterror is undef for a new feed —
                # looks intentional but worth confirming.
                my $url = $_[1];
                my @newurls;
                if ( @newurls = grep  { $_->[1] eq $url } @urls ) {
                    @urls = @newurls;
                } else {
                    $dbh->do( 'INSERT INTO feeds(url) VALUES(?)', undef, $url );
                    my $feedid = $dbh->last_insert_id( undef, undef, undef, undef );
                    @urls = ( [ "New feed", $url, $feedid ] );
                }
            },
            'network!' => \$network,
            'refresh!' => \$refresh,
            'reparse!' => \$reparse ) or die;

if ( $debug ) {
    get_logger()->level($DEBUG);
}

# optional positional argument: only process the feed with this title
my $only = shift;

$XML::RSS::AUTO_ADD = 1; # force acceptance of extra namespaces

my $ua = new LWP::UserAgent;
$ua->agent( "RSS/0.1 " . $ua->agent );
$ua->env_proxy();

# Piping hot!
$| = 1;

my $fileprefix; # boo. global.

# Main per-feed loop. For each feed row: decide from the cached clean
# feed and its syn:updatePeriod/updateFrequency hints whether a refetch
# is due; conditionally fetch over HTTP (with ETag/If-Modified-Since);
# normalise the content (Atom->RSS conversion, pubDate repair,
# relative-link absolutising); persist the cleaned feed; and hand the
# items to timestamp() for per-item database bookkeeping.
for my $feed ( @urls ) {
    my ( $title, $url, $feedid, $lasterror ) = @{$feed};
    my ( $rss, $channel, $res, $req, $content );
    my $lastupdated = time;

    # positional "only" filter: skip all other feeds
    if ( defined( $only )) {
        DEBUG("> $feedid: $title\n");
        next unless $title eq $only;
    }

    # see what we have in the database
    my $feeddata =
        $dbh->selectall_arrayref( 'SELECT feedid, name, url, ' .
                                  'UNIX_TIMESTAMP(lastupdate), rawfeed, ' .
                                  'etag, modified, cleanfeed, charset ' .
                                  'FROM feeds WHERE feedid=?', undef,
                                  $feedid );

    # feed failure, just skip to the next one.
    if ( !$feeddata ) {
        next;
    }

    # name the columns of the single row we fetched
    my %thisfeed =
        (
         feedid     => $feeddata->[0]->[0],
         name       => $feeddata->[0]->[1],
         url        => $feeddata->[0]->[2],
         lastupdate => $feeddata->[0]->[3],
         rawfeed    => $feeddata->[0]->[4],
         etag       => $feeddata->[0]->[5],
         modified   => $feeddata->[0]->[6],
         cleanfeed  => $feeddata->[0]->[7],
         charset    => $feeddata->[0]->[8],
        );

    # --refresh (or a feed with no raw copy yet) discards the cached
    # state so the fetch below is unconditional
    if ( $refresh or !$thisfeed{rawfeed}) {
        $thisfeed{lastupdate} = undef;
        $thisfeed{rawfeed} = undef;
        $thisfeed{etag} = undef;
        $thisfeed{modified} = undef;
        $thisfeed{cleanfeed} = undef;
    }

    DEBUG("Doing '$thisfeed{name}' ($feedid)");
    # debug/failure files live under /var/tmp, named from the title
    # (non-alphanumerics flattened to "_") or failing that the feed id
    $fileprefix = "/var/tmp/";
    if  ( $title ) {
        my $t = $title;
        $t =~ s@[^a-z0-9]@_@gi;
        $fileprefix .= $t;
    } else {
        $fileprefix .= $feedid;
    }

    # First, check against the cachefile whether we need to update or
    # not. This includes checking how often the cachefile itself says we
    # should update. NB this operates on the *cleaned* feed.
    if ( $thisfeed{cleanfeed} ) {
        my $updateFreq = 1800;  # minimum 30 minutes between refreshes
        $rss = new XML::RSS;
        eval {
            $rss->parse( $thisfeed{cleanfeed} ) or warn $!;
        };
        # check if it needs to be refreshed
        $channel = $rss->{channel};
        if ( defined( $channel )) {
            my $syn = $channel->{syn};
            if ( defined( $syn )) {
                # honour the feed's own mod_syndication hints
                $updateFreq = $syn->{updateFrequency};
                my $updatePeriod = $syn->{updatePeriod};

                if ( $updatePeriod eq 'hourly' ) {
                    $updateFreq *= ( 60 * 60 );
                } elsif ( $updatePeriod eq 'daily' ) {
                    $updateFreq *= ( 60 * 60 * 24 );
                } else {
                    # unknown period: be conservative and treat as daily
                    warn "No idea what to do with $updatePeriod\n";
                    $updateFreq *= ( 60 * 60 * 24 );
                }
            }
        }

        # now check against the file date
        my $mtime = $thisfeed{lastupdate};
        $lastupdated = $mtime;
        if ( time > $mtime + $updateFreq ) {
            undef $rss;         # forces reload
        } else {
            DEBUG("  not due for update yet");
            next unless $only or $refresh or $reparse;
        }
    } else {
        DEBUG("  no clean feed found");
    }

    # a forced refresh starts with a clean error slate
    $lasterror = '' if $refresh;

    if ( !$rss and $network ) {
        $url =~ s/^feed:/http:/;
        DEBUG("  Fetching $url...");

        $req = new HTTP::Request GET => $url;

        # Support ETag and If-Modified-Since
        # No point if the cache file is missing, mind you.
        if ( $thisfeed{cleanfeed}||'' ) {
            if ( $thisfeed{etag}||'' ) {
                $req->push_header( 'If-None-Match', $thisfeed{etag} );
            }
            if ( $thisfeed{modified}||'' ) {
                $req->push_header( 'If-Modified-Since', $thisfeed{modified} );
            }
        }

        $res = $ua->request( $req );

        # check if we're being redirected...
        my $final_url = $res->base;

        if ( $res->is_success ) {
            $content = $res->decoded_content();
            $lastupdated = time;
            my ( $save, $saveres, @saveparams );

            # if the final URL differs, record the new location too
            if ( !$final_url->eq( $url )) {
                DEBUG("  looks like it moved to " . $final_url);
                $save = $dbh->prepare( 'UPDATE feeds SET url=?,rawfeed=?,error=NULL WHERE feedid=?' );
                push @saveparams, $final_url;
            } else {
                $save = $dbh->prepare( 'UPDATE feeds SET rawfeed=?,error=NULL WHERE feedid=?' );
            }
            push @saveparams, $content, $thisfeed{feedid};
            $saveres = $save->execute( @saveparams );

            if ( !defined( $saveres )) {
                die $DBI::errstr;
            }

            # save the ETag/Last-Modified bits
            for my $hdr ( "ETag", "Last-Modified", "Content-Type" ) {
                DEBUG("   $hdr: " . ( $res->headers->header( $hdr ) || "unset" ));
                if ( my $val = $res->headers->header( $hdr )) {
                    if ( $hdr eq "ETag" ) {
                        $save = $dbh->prepare( 'UPDATE feeds SET etag=? WHERE feedid=?' );
                    } elsif ( $hdr eq "Last-Modified" ) {
                        $save = $dbh->prepare( 'UPDATE feeds SET modified=? WHERE feedid=?' );

                        # snag the last-modified date if it's present
                        my $tval = str2time( $val );
                        $lastupdated = $tval if $tval;
                    } else {
                        $save = $dbh->prepare( 'UPDATE feeds SET charset=? WHERE feedid=?' );
                        $thisfeed{charset} = $val;
                    }

                    $saveres = $save->execute( $val, $thisfeed{feedid} );
                    if ( !defined( $saveres )) {
                        warn "saving headers: $DBI::errstr";
                    }
                }
            }
        } else {
            # Not modifed
            if ( $res->code == 304 ) {
                DEBUG("  Page not modified.");
                next unless $reparse;
            } else {
                DEBUG("  Failed to fetch page: " . $res->code . " " . $res->message);
                next;
            }
        }

        # NOTE(review): this sets the *global* flag, so once any feed
        # is fetched, every subsequent feed in @urls is reparsed too —
        # confirm that's intended rather than a per-iteration flag.
        $reparse = 1;
    }

    if ( $reparse ) {
        # if the content is undefined, then snork it up from the cache file.
        if ( !defined( $content ) or $content eq "" ) {
            DEBUG("  Using cached version");
            if ( $thisfeed{rawfeed}||"" ) {
                $content = $thisfeed{rawfeed};
            } else {
                DEBUG("  No cached feed for $title!");
            }
        }

        # If we still have no content, flee the premises.
        if ( !defined( $content )) {
            DEBUG("No content");
            next;
        }

        # $base: URL up to the last slash (for relative links);
        # $site: scheme+host only (for root-relative links). Both are
        # used when absolutising hrefs in item descriptions below.
        my ( $base ) = $url =~ m@^(.*://?.+)/@;
        my ( $site ) = $url =~ m@^(.*://?[^/]+)/@;
        my $original = $content;
        my $preparsed;

        # split the stored Content-Type header into type and charset
        my ( $charset, $contenttype );
        if ( $contenttype = $thisfeed{charset} ) {
            if ( $contenttype =~ /\bcharset=([^ ;]+)/ ) {
                DEBUG("  Charset: $1");
                $charset = $1;
            } else {
                $charset = "";
            }
            $contenttype =~ s/;.*$//;
            DEBUG("  Cleaned Type: $contenttype");
        }

        # servers lie: if the body starts with an XML declaration,
        # treat it as XML regardless of the declared type
        if (( $contenttype ne "text/xml" ) &&
            # is it REALLY html?
            ( $content =~ /^<\?xml/ )) {
            $contenttype = "text/xml";
            DEBUG("  Fixed content type to $contenttype");
        }

        if ( $contenttype ne "text/xml" and $contenttype ne "application/xml" and $contenttype ne "application/atom+xml" ) {
            logerror( "content-type '$contenttype' incorrect", $lasterror, $thisfeed{feedid}, $url, $title, $content );
            next;
        }

        # if it's an Atom feed, switch it to RSS. This is horribly rough.
        if ( $content =~
             m@(http://purl.org/atom|xmlns=["']?http://www.w3.org/2005/Atom)@si ) {
            DEBUG("  Converting to RSS...");
            eval {
                $preparsed = atom_to_rss( $content, $url );
            };
            if ( $@ ) {
                logerror( "Atom conversion: " . $@, $lasterror, $feedid, $url, $title||"<unknown>", $content );
                next;
            }
            DEBUG("  Converting to RSS...done.");
        }

        # throw away everything before the first XML declaration so we
        # can get a clean parse if at all possible.
        $content =~ s/^.*?(<\?xml)/$1/s;

        # should be a one-off, but.
        $content =~ s/&pound/\&amp;pound/gs;

        # debugging
        if ( $debug ) {
            open( PREPARSE, ">$fileprefix.preparse" ) or warn $!;
            print PREPARSE $content;
            close( PREPARSE );
        }

        DEBUG("  Parsing RSS ");
        eval {
            # clean up fail-files
            unlink( "$fileprefix.failed" );
            unlink( "$fileprefix.parsed" );
            unlink( "$fileprefix.new" );

            # Atom feeds were already converted to an XML::RSS object;
            # everything else gets parsed here
            if ( !defined( $preparsed )) {
                $rss = new XML::RSS( version => "1.0", encoding => 'UTF-8',
                                     encode_output => 1 );
                if ( !$rss->parse( $content )) {
                    my $err = "";
                    $err = $@ if $@;
                    $err .= " ($!)" if $! and $! ne "Success";
                    $err ||= "unknown error";
                    # Eval will catch
                    DEBUG(" $err");
                    die "\nparser failed: $err. File in $fileprefix.failed";
                }
            } else {
                $rss = $preparsed;
            }

            DEBUG($rss->{version});

            if ( $debug ) {
                open( PARSED, ">$fileprefix.parsed" ) or warn $!;
                print PARSED Dumper( $rss );
                close( PARSED );
            }

            # pubDate needs to be in strict Mail date format for RSS2
            $channel = $rss->{channel};
            if ( defined( $channel->{pubDate})) {
                DEBUG("  Fixing pubDate for channel: ");
                my $fixdate = str2time( $channel->{pubDate});
                if ( defined( $fixdate )) {
                    $channel->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z",
                                                    localtime( $fixdate ));
                } else {
                    # best we can do
                    $channel->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /;
                }
                DEBUG($channel->{pubDate});
            }

            # XML::RSS barfs on feed images without titles. But it
            # barfs if there's no image tag, too. What a piece of
            # shit.
            delete $rss->{image}->{url}
            if defined( $rss->{image} );

            # Fix a variety of per-item bogosity
            map {
                # trim title
                $_->{title} =~ s/\s+$//s;
                $_->{title} =~ s/^\s+$//s;

                # ARGH. XML::RSS won't save items with blank titles.
                $_->{title} = "(untitled)" if !( $_->{title});

                # RTE: wtf?
                if ( exists( $_->{item} )) {
                    delete $_->{item};
                }

                # and the fucking stupidity with the entities
                #$_->{title} =~ s/&(?!(amp|gt|lt))/&amp;/gs;

                # RSS 2.0 gives a content:encoded block which contains
                # a formatted version of the post. Ideally I'd like to
                # use this. Both description and content:encoded have
                # entity escapes, though.
                my $ctag = $_->{content};
                if ( defined( $ctag ) and ref $ctag eq "HASH" and
                     defined( $ctag->{encoded})) {
                    $_->{description} = $ctag->{encoded};
                }

                # cope with 2.0 RSS feed.
                if ( defined( $_->{guid})) {
                    # don't overwrite good links
                    $_->{link} = $_->{guid} unless $_->{link};
                }

                # Some of the pubDate stuff is messed up.
                if ( defined( $_->{pubDate} )) {
                    my $fixdate = str2time( $_->{pubDate});
                    if ( defined( $fixdate )) {
                        $_->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z",
                                                  localtime( $fixdate ));
                    } else {
                        # best we can do
                        $_->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /;
                    }
                }

                # HURGH. I don't know whose fault this is, but I don't like it.
                #$_->{description} =~ s/=&amp;quot;(.+?)&amp;quot;/="$1"/gis;

                # DIE PUNY HUMANS, so to speak. Turns out Warren ain't
                # alone. also, I should probably do this with a
                # parser.
                $_->{description} =~ s@(href|src|data)="/@$1="$site/@gs;
                $_->{description} =~
                    s@(href|src|data)="(?!(http|ftp))@$1="$base/@gs;

                # final cleanup: nuke leading/trailing space
                $_->{description} =~ s/^\s+//;
                $_->{description} =~ s/\s+$//;
            } @{$rss->{items}};

            # only a feed with items is worth timestamping/saving
            if ( @{$rss->{items}}) {
                $rss->{charset} = $charset;
                timestamp( $title, $rss, $lastupdated, $thisfeed{feedid} );
            } else {
                die "No items in feed";
            }

            # this tosspot saves entites as unicode, causing further
            # irritation.
            $rss->{channel}->{title} =~ s/&(?!(amp|gt|lt))/&amp;/gs;
        };
        if ( $@ ) {
            logerror( "RSS parse: " . $@, $lasterror, $thisfeed{feedid}, $url, $title, $content,  );
            next;
        }

        # now check if it changed at all.
        my $newfeed = "";
        eval {
            $newfeed = $rss->as_string;
        };

        if ( $@ ) {
            # serialisation failed: show an excerpt around the broken
            # byte if the error names one, then save for post-mortem
            warn "$title ($url): " . $@;
            if ( $@ =~ /not well-formed.*byte (\d+)|mismatched tag at line \d+, column \d+, byte (\d+)/ ) {
                DEBUG("Excerpt:");
                DEBUG(substr( $content, $1 - 20, 20 ) . "=>" . substr( $content, $1, 1 ) . "<=" . substr( $content, $1 + 1, 20 ));

            } else {
                DEBUG(Dumper( $rss ));
            }
            open( SAVED, ">$fileprefix.failed" ) or warn $!;
            print SAVED $content;
            close( SAVED );

            next;
        }

        # only write the cleaned feed back if it actually changed
        if (( $thisfeed{cleanfeed}||'') ne $newfeed ) {
            my $save = $dbh->prepare( 'UPDATE feeds SET name=?,cleanfeed=? WHERE feedid=?' );
            my $saveres = $save->execute( $rss->{channel}->{title}||$title,
                                          $newfeed, $thisfeed{feedid} )
                or warn "Saving $title: $DBI::errstr";
        }

        # otherwise clean up any existing failed files
        unlink( "$fileprefix.failed" );

        if ( $debug ) {
            open( SAVED, ">$fileprefix.new" ) or warn $!;
            print SAVED Dumper( $newfeed );
            close( SAVED );
        }

        # housekeeping: the hdr file is invalid if we couldn't
        # successfully parse the RSS feed. On the other hand, if we did
        # parse the feed we don't need the fetched file.
        # no real database equivalent for this... check if there are
        # any items in the new feed, perhaps. We can also erase the
        # rawfeed column.
    } else {
        DEBUG("  Using data from " . scalar( localtime( $lastupdated )));
    }

    # make the timestamp on the file correct.
    $dbh->do( 'UPDATE feeds SET lastupdate=FROM_UNIXTIME(?) WHERE feedid=?',
              undef, $lastupdated, $thisfeed{feedid} ) or
                  warn( "timestamp update for $title failed ($DBI::errstr)" );
}

# Attempt to attach timestamps to untimestamped feeds. Won't do
# anything useful the first time through.
# Store the items of a freshly parsed feed in the items table.
#
# Arguments: feed title, XML::RSS object, fallback epoch timestamp,
# feed id. Behaviour, as visible below:
# - marks all of the feed's existing items inactive up front; every
#   item seen in this parse is (re)written with active=1, so whatever
#   stays inactive has dropped out of the feed;
# - derives a per-item date from dc:date or pubDate when present,
#   otherwise from feed-specific patterns in guid/title/description,
#   otherwise from a ts_<FeedName> helper (dispatched via string eval),
#   finally falling back to $lastupdated;
# - identifies items by an MD5 digest of squished title + text-only
#   description + link; an older digest recipe is also computed so rows
#   hashed under the previous scheme can be updated in place.
sub timestamp {
    my ( $feed, $rss, $lastupdated, $feedid ) = @_;
    # deactivate everything; items still in the feed get reactivated
    my $expire = $dbh->do( 'UPDATE items SET active=0 WHERE feedid=' .
                           $feedid );

    DEBUG("  Timestamping $feed");

    for my $item (@{$rss->{items}}) {
        my ( $itemid, $ts, $date );

        DEBUG(Dumper($item));

        # fixme: this should be configurable
        if ( defined( $item->{dc} ) and defined( $item->{dc}->{date})) {
            $date = rss_date_to_unix( $item->{dc}->{date} );
            DEBUG("   Using dc date " . scalar( localtime( $date )));
        } elsif ( defined( $item->{pubDate})) {
            $date = rss_date_to_unix( $item->{pubDate});
            DEBUG("   Using pubdate " . scalar( localtime( $date )));
        } else {
            # no explicit date: try to scrape y/m/d out of the guid,
            # title or description using known per-site patterns
            my ( $y, $m, $d );
            my $guid = $item->{guid}||$item->{link};
            my $title = $item->{title};
            my $desc = $item->{description};

            if ( defined( $guid )) {
                # dnalounge
                ( $y, $m, $d ) = $guid =~
                    m{(\d{4})/(\d{2}).html#(\d{2})};
                goto GLOM if $y and $m and $d;
            }

            if ( defined( $title )) {
                # arcamax doonesbury feed
                ( $m, $d, $y ) = $title =~
                    m{Doonesbury (\d+)/(\d+)/(\d+)};
                goto GLOM if $y and $m and $d;

                # doonesbury, I hate you
                next if $title eq "Past Stories";

                # jerkcity
                ( $d, $m, $y ) = $title =~
                    m{\b(\d{1,2})-(\w{3})-(\d{4})\b};
                goto GLOM if $y and $m and $d;

                # NTK
                ( $y, $m, $d ) = $title =~
                    m{\b(\d{4})-(\d{2})-(\d{2})};
                goto GLOM if $y and $m and $d;
            }

            if ( defined( $desc )) {
                # john shirley
                ( $d, $m, $y ) = $desc =~
                    m{\b(\d{1,2})-(\w{3})-(\d{4})\b};
                goto GLOM if $y and $m and $d;
            }

          GLOM:
            $date = sprintf( "%04d-%02d-%02d", $y, $m, $d )
                if $y and $m and $d;

            if ( defined( $date )) {
                $date = str2time( $date );
            }
        }

        if ( !defined( $date )) {
            # last resort: dispatch to a ts_<FeedName> helper sub if
            # one exists. String eval, but $feedfunc is stripped to
            # letters only first, so nothing hostile can be injected.
            my $feedfunc = $feed;
            $feedfunc =~ s/[^a-zA-Z]//gs;
            eval '$date = ts_' . $feedfunc . '( $item );';
            DEBUG("   Faked date from item: $date") if defined($date);
            $date = rss_date_to_unix( $date ) if $date;
        }

        # stuff dc->date
        if ( defined( $date ) and !defined( $item->{dc})) {
            my %dc;
            # stupid W3CDTF can't handle timezones :(
            $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $date ));
            $item->{dc} = \%dc;
            DEBUG("   adding dc:date $date => " . $dc{date});
            $ts = $date;
        }

        # need to fake up a datestamp, catering for potential in-line utf8.
        # rawarticlehash is quite obviously a poor name at this point.
        # Extract the plain text of the description (HTML::TokeParser
        # "T" tokens are text) for the digest input.
        my $text = $item->{description};
        my $p = new HTML::TokeParser( \$text );
        my $c = "";
        while ( my $t = $p->get_token()) {
            next unless $t->[0] eq "T";
            $c .= $t->[1];
        }

        # craziness for which I cannot easily account in generic code,
        # so it's just going to have to be specific.
        $c =~ s/([A-Z])&([A-Z])/$1&amp;$2/gs;

        # old-style digest: squish('old') rules, raw link
        my $food = encode_utf8(( squish( $item->{title}||'', 'old')) .
                               ( squish( $c, 'old')) .
                               ( $item->{link}||'' ));

        DEBUG("   Old digest calculation from '$food'");
        my $digest = md5_hex( $food );
        DEBUG("   Calculated old digest $digest");

        # wtf?
        $item->{link} =~ s/\s+$//;

        # new-style digest: squish('new') rules, ASCII-only link
        $food = encode_utf8(( squish( $item->{title}||'', 'new')) .
                            ( squish( $c, 'new')) .
                            ( ascii_only( $item->{link}||'' )));
        DEBUG("   Digest calculation from '$food'");
        my $newdigest = md5_hex($food);
        DEBUG("   Calculated digest $newdigest");

        # look the item up by its new digest first...
        my $itemdata = $dbh->selectall_arrayref( 'SELECT itemid, unix_timestamp(ts) FROM items WHERE feedid=? AND rawarticlehash=?', undef, $feedid, $newdigest );

        if ( defined( $itemdata ) and @{$itemdata} ) {
            $itemid = $itemdata->[0]->[0];
            $ts = $itemdata->[0]->[1] unless $ts;
        } else {
            # ...then by the old digest, so legacy rows get migrated
            # to the new hash when rewritten below
            if ( $newdigest ne $digest ) {
                my $itemdata = $dbh->selectall_arrayref( 'SELECT itemid, unix_timestamp(ts) FROM items WHERE feedid=? AND rawarticlehash=?', undef, $feedid, $digest );
                if ( defined( $itemdata ) and @{$itemdata} ) {
                    DEBUG("   Updating old digest");
                    $itemid = $itemdata->[0]->[0];
                    $ts = $itemdata->[0]->[1] unless $ts;
                }
            }
            $ts = $lastupdated unless $ts;
        }

        if ( defined( $itemid )) {
            DEBUG("   wait, this is already in the database!");
        }

        my $stamp;
        if ( !defined( $itemid )) {
            DEBUG("   generating timestamp for " . encode_utf8( $item->{title} ));
            $stamp = $dbh->prepare( 'REPLACE INTO items(feedid,rawarticle,rawarticlehash,subject,body,url,ts,updated,active) VALUES(?,?,?,?,?,?,FROM_UNIXTIME(?),FROM_UNIXTIME(?),1)' );
            #, { mysql_is_blob => [ 0, 1, 0, 0 ] });
            # existing date overrides
            if ( defined( $date )) {
                DEBUG("   Using item date " . scalar( localtime( $date )));
                $ts = $date;
            }
            # the whole item structure is frozen into rawarticle
            my $frozen = freeze( $item );
            my $res = $stamp->execute( $feedid, $frozen, $newdigest,
                                       $item->{title},
                                       $item->{description},
                                       $item->{link}, $ts, time );
        } else {
            $stamp = $dbh->prepare( 'UPDATE items set feedid=?,rawarticle=?,rawarticlehash=?,subject=?,body=?,url=?,ts=FROM_UNIXTIME(?),updated=FROM_UNIXTIME(?),active=1 WHERE itemid=?' );
            my $frozen = freeze( $item );
            my $res = $stamp->execute( $feedid, $frozen, $newdigest,
                                       $item->{title},
                                       $item->{description},
                                       $item->{link},
                                       $ts, time, $itemid );
        }

        # NOTE(review): last_insert_id after the UPDATE branch can
        # return a stale id; the "$newid != $itemid" guard below seems
        # to be what keeps the log honest — confirm.
        my $newid = $dbh->last_insert_id( "", "rss", "items", "itemid" );
        DEBUG("   inserted as item $newid")
            if $newid and $newid != ( $itemid || 0 );

        # if we never found a date, patch the fallback timestamp into
        # dc:date so downstream consumers see something
        if ( !defined( $item->{dc})) {
            my %dc;
            #$dc{date} = strftime( '%Y-%m-%dT%H:%M+0000', gmtime( $ts )); # XXXX
            # stupid W3CDTF can't handle timezones :(
            $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $ts ));
            $item->{dc} = \%dc;
            DEBUG("   patching in time $ts => " . $dc{date});
        }
    }

    # now clean up anything that's not active
    #my $expire = $dbh->do( 'DELETE FROM items WHERE active=0' );
}

# convert rss date to unix time_t
# Convert an RSS/ISO8601-ish date string to a unix time_t via
# Date::Parse::str2time, after massaging the common variants str2time
# can't handle directly. Returns undef if the date can't be parsed.
sub rss_date_to_unix {
    my $date = shift;
    my $inp = $date;

    # RSS 2.0 uses a totally different date format. On the plus side,
    # Date::Parse should be able to handle it unchanged.
    if ( $date =~ /^\d+-/ ) {
        # Blogger's variation: 2003-07-16 17:44:13Z
        $date =~ s/[T ](\d+:\d+):\d+Z$/T$1+00:00/; # Z = Zulu time = GMT
    }

    # str2date vs ISO8601: space out the T separator and strip the
    # colon from the timezone offset
    $date =~ s/(\d)T(\d)/$1 $2/;
    $date =~ s/(\d\d):(\d\d)$/$1$2/;

    my $outp = str2time( $date );
    # BUG FIX: $outp is undef when str2time fails, which used to raise
    # "uninitialized value" warnings when interpolated directly here.
    DEBUG("   in: $inp out: " . ( defined( $outp ) ? $outp : "<unparseable>" ));

    return $outp;
}

# per-feed timestamps
# if a sub called ts_FeedName exists, it'll get called to try and
# extract a usable timestamp from an RSS item.
# Derive a timestamp for the Cloudiness feed from its blosxom-style
# permalink (.../blog/YYYY/MM/DD#anchor), rendered as midnight in the
# -0600 zone. Returns undef when the link doesn't match.
sub ts_Cloudiness {
    my $entry = shift;
    my ( $day ) = $entry->{link} =~ m|.*blog/(.+)#|;
    return undef unless defined $day;
    $day =~ s@/@-@g;
    return $day . 'T00:00-0600';
}

# Same blosxom-permalink trick as ts_Cloudiness, but rendered as
# midnight UTC. Returns undef when the link doesn't match.
sub ts_KevLyda {
    my $entry = shift;
    my ( $when ) = $entry->{link} =~ m|.*blog/(.+)#|;
    if ( defined( $when )) {
        $when =~ tr{/}{-};
        $when = $when . 'T00:00+0000';
    }
    return $when;
}

# maybe it's time I just folded all these into ts_Bloxsom and added a
# bloxsom detector...
# Yet another blosxom-style permalink date (.../blog/YYYY/MM/DD#ref),
# rendered as midnight UTC; undef when the link doesn't match.
sub ts_Nanocrew {
    my $entry = shift;
    my $stamp;
    if ((( $stamp ) = $entry->{link} =~ m|.*blog/(.+)#| )) {
        $stamp =~ s{/}{-}g;
        $stamp .= 'T00:00+0000';
    }
    return $stamp;
}

# DNA Lounge item titles begin with a parseable date, possibly followed
# by a parenthesised note which must be stripped before parsing; render
# the result as a -0800 stamp. Returns undef if the title is missing or
# unparseable.
sub ts_DNALounge {
    my $entry = shift;
    my $when = $entry->{title};
    return undef unless defined $when;
    $when =~ s/ \(.*\)//;
    $when = str2time( $when );
    return undef unless defined $when;
    return strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $when ));
}

# LungFish links end in ".<epoch seconds>"; pull that off and format it
# as a -0800 stamp. Returns undef when the link doesn't match.
sub ts_LungFish {
    my $entry = shift;
    my ( $epoch ) = $entry->{link} =~ m/^.*\.(\d+)$/;
    return undef unless defined $epoch;
    return strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $epoch ));
}

# RedMeat links embed the strip date as .../redmeat/YYYY-MM-DD...;
# parse it and render it in the same style as the other ts_* helpers.
# Returns undef when the link doesn't match or the date won't parse.
#
# BUG FIX: this previously ended with "return undef;", discarding the
# computed $date entirely — callers could never get a timestamp from
# this helper. Return $date like its siblings do.
sub ts_RedMeat {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m/redmeat\/(\d{4}-\d{2}-\d{2})/;
    if ( defined( $date )) {
        $date = str2time( $date );
        if ( defined( $date )) {
            $date = strftime( '%Y-%m-%dT%H:%M-0000', localtime( $date ));
        }
    }
    return $date;
}

# Convert an Atom feed (raw XML string) into an XML::RSS 1.0 object so
# the rest of the pipeline only deals with RSS. Deliberately rough:
# text-typed <title> contents are entity-encoded up front, entries with
# no content fall back to their summary or the literal "no content",
# and the channel link prefers the feed's text/html alternate, falling
# back to the fetch URL. Returns undef for a feed with no title; dies
# (caller evals) on parse failure.
sub atom_to_rss {
    my $feed = shift;
    my $url = shift;

    # drop anything before the XML declaration
    $feed =~ s/^.*?<\?xml/<?xml/gs;

    # thanks, guys, that'll do nicely
    $feed =~ s@<title(\s+type=["']text['"])>(.*?)</title>@"<title$1>". encode_entities($2) ."</title>"@egs;

    DEBUG("  Parsing feed...");
    my $atom = XML::Atom::Feed->new( \$feed ) or die "new(): " . $!;
    return undef if !defined( $atom->title );

    my $rss = new XML::RSS(
        version => '1.0',
        encoding => 'UTF-8',
        encode_output => 1,
        );
    # walk the feed links looking for the text/html alternate
    my @links = $atom->link;
    while ( @links ) {
        last if $links[0]->type eq "text/html";
        shift @links;
    }

    DEBUG("  building XML::RSS object");

    $rss->channel
        (
         title => $atom->title,
         link => ( defined( $links[0] ) ? $links[0]->href :
                   $url ),
         description => $atom->subtitle,
         dc => {
             date => $atom->modified || $atom->updated,
             language => $atom->language,
         }
        );

    for my $entry ( $atom->entries ) {
        DEBUG("  adding entry");

        # guarantee some content so the RSS item has a description
        if ( !defined( $entry->content )) {
            if ( !defined( $entry->summary )) {
                $entry->content( "no content" );
            } else {
                $entry->content( $entry->summary );
            }
        }

        # NOTE(review): $entry->link here takes the entry's first link
        # regardless of type, unlike the channel-link selection above —
        # confirm that's acceptable.
        my %item = (
            title => $entry->title,
            link => $entry->link->href,
            description => $entry->content->body,
            dc => {
                date => $entry->issued || $entry->updated,
            },
            );

        if ( $entry->author ) {
            $item{dc}->{creator} = $entry->author->name;
        }

        $rss->add_item(  %item );
    }

    return $rss;
}

# Normalise a chunk of text for digest calculation: strip punctuation
# and collapse whitespace. Two flavours exist: 'old' (the default)
# keeps anything [:word:] or [:space:], while 'new' first flattens
# tabs/newlines to spaces and then keeps only ASCII word chars and
# spaces, so that digests computed under either recipe can be matched.
sub squish {
    my ( $str, $flavour ) = @_;

    $flavour = 'old' unless $flavour;

    if ( $flavour eq 'old' ) {
        # keep word characters and whitespace only
        $str =~ s/[^[:word:][:space:]]//gs;
    } else {
        # tabs and newlines become single spaces first
        $str =~ s/\t+/ /gs;
        $str =~ s/\n+/ /gs;
        $str =~ s/[^a-zA-Z0-9_ ]//gs; # keeping pace with python
    }

    # collapse runs of spaces and trim both ends
    $str =~ s/ +/ /gs;
    $str =~ s/^\s+//gs;
    $str =~ s/\s+$//gs;

    return $str;
}

# Strip every non-ASCII character (code point >= 128) from a string,
# returning what's left.
sub ascii_only {
    my $input = shift;
    return join '', grep { ord($_) < 128 } split //, $input;
}

# Record a feed failure in the feeds.error column (or clear a previous
# one when $newerror is false), log diagnostics, and save the offending
# content to "$fileprefix.failed" for post-mortem.
#
# Arguments: new error text, previously recorded error text (if any),
# feed id, feed URL, feed title, raw feed content.
sub logerror {
    my $newerror = shift;
    my $lasterror = shift;
    my $feed = shift;
    my $url = shift;
    my $title = shift;
    my $content = shift;

    my $errst = $dbh->prepare( "UPDATE feeds SET error=? WHERE feedid=?" );

    if ( $newerror ) {
        # BUG FIX: the original format string had seven %s but only six
        # arguments (plus an unbalanced paren), producing an empty
        # trailing field and an uninitialized-value warning every time.
        DEBUG(sprintf("%s: %s (%s %s %s %s)\n",
                      scalar(gmtime(time)),
                      $newerror,
                      $lasterror||"no last error",
                      $feed,
                      $url,
                      $title||"<no title>"));
        # only the first 255 characters of the error are stored
        my $res = $errst->execute( substr( $newerror, 0, 255 ), $feed );
        if ( !$res ) {
            die $DBI::errstr;
        }

        # did this break already? Dump a parse excerpt only the first
        # time a feed fails; repeats are just logged.
        if ( !$lasterror ) {
            chomp( $newerror );
            if ( $newerror =~ /not well-formed.*byte (\d+)|at line \d+, column \d+, byte (\d+)/ ) {
                my $offset = ( $1 || $2 );
                print STDERR "Excerpt around byte $offset:\n";
                print STDERR substr( $content, $offset - 20, 20 ) . "=>" . substr( $content, $offset, 1 ) . "<=" . substr( $content, $offset + 1, 20 );
                print STDERR "\n";
            }
        } else {
            DEBUG("  feed failed and already logged:");
            DEBUG("   Last Error:\n  $lasterror\n  New Error:\n  $newerror");
        }
        # save the failed content for inspection; lexical filehandle
        # and three-arg open, and don't print to a handle that failed
        # to open (the old code did)
        if ( open( my $saved, '>', "$fileprefix.failed" )) {
            print $saved $content;
            close( $saved );
        } else {
            warn $!;
        }
    } else {
        # no new error: clear any previously recorded one
        my $res = $errst->execute( "", $feed );
        die $DBI::errstr unless $res;
    }
}
