#!/usr/bin/perl -w
#
# This script is intended to assist you in renaming/retagging MP3
# files you've downl^Wripped from your legally-purchased CDs but
# neglected to tag/name correctly. It uses FreeDB to check what your
# file's supposed to be tagged as based on information gleaned from
# the file. If you've got a griprc, it uses that to determine what the
# file should be named.
#
# Basically, it's part of my ongoing quest to have my computer do damn
# near everything for me,
#
# 22/06/2003 Created
#
use lib "$ENV{HOME}/src/perl";
use LWP::UserAgent;
use HTML::TokeParser;
use URI::Escape;
use CDDB 1.15;                  # 1.15 required for file parsing
use MP3::Info;
use Getopt::Long;
use File::Basename;
use MP3::ID3Lib;
use bytes;       # because CDDB is ISO8859-1, everyone is. Tough luck.

use strict;

my $searchurl = 'http://www.freedb.org/freedb_search.php?';

$| = 1;
my $debug = 1;

# These are the current mirrors (January 2003)

# June 2005: not responding?
#  freedb.freedb.de
#  de.freedb.org

my @mirrorsites = qw(
  at.freedb.org
  au.freedb.org
  bg.freedb.org
  ca.freedb.org
  es.freedb.org
  fi.freedb.org
  lu.freedb.org
  no.freedb.org
  uk.freedb.org
  us.freedb.org
);

my $id;
my $force = 0;
my $track_override = 0;
GetOptions( "id=s" => \$id, "force!" => \$force,
            "track=i" => \$track_override );

# from Gronk's config.pl
my $force_lowercase = 0;
my $force_underscore = 0;
my $safechars =
  "a-z0-9" .                    # alphanumerics
  "_()";                        # punctuation

# lowercase ISO chars
for my $i ( 0xe0..0xff ) {
    $safechars .= pack( "C", $i );
}

# make sure we don't cripple ourselves in cddb.pl::string_to_file
if ( $force_underscore ) {
    $safechars .= "_";
} else {
    $safechars .= " ";
}

# Open a connection for each mirror
my @mirrors;
for my $mirror ( @mirrorsites) {
    my $cddbp = new CDDB( Host => $mirror, Port => 888 );
    if ( defined( $cddbp )) {
        push @mirrors, $cddbp;
    } else {
        print STDERR "Failed to connect to $mirror\n";
    }
}

my $cddbp = $mirrors[ 0 ];
my $ua = new LWP::UserAgent;
$ua->env_proxy();
my @fullmatch;

if ( my @discids = grep /discid/, @ARGV ) {
    my $discid = shift @discids;
    open( DISCID, $discid ) or die "$discid: $!";
    my $newid = <DISCID>;
    close( DISCID );
    chomp( $newid );
    if ( !defined( $id )) {
        $id = $newid;
    }
}

my ( $artist, $album, $track, $length, $tnum );
while ( my $file = shift ) {
    next if $file =~ /discid/;

    if ( -d $file ) {
        $file =~ s/\/$//; # canonicalise
        if ( opendir( DIR, $file )) {
            push @ARGV, map { "$file/$_" } grep !/^\.\.?$/, readdir( DIR );
            closedir( DIR );
            next;
        } else {
            die "can't open $file: $!\n";
        }
    }

    if ( ! -e $file ) {
        print "$file does not exist!\n";
        next;
    }
    print "$file\n";
    $#fullmatch = -1;
    my $tag = get_mp3tag( $file );
    if ( !defined( $tag )) {
        print "  No MP3 tags found!\n";
        my $filetype = `file '$file'`;
        chomp( $filetype );
        if ( $filetype !~ "MP3|MPEG Layer 3" ) {
            print "   doesn't appear to be an MP3 file, even\n";
            next;
        }
    }

    $artist = $tag->{'ARTIST'} || "";
    $album = $tag->{'ALBUM'} || "";
    $track = $tag->{'TITLE'} || "";
    $tnum = $tag->{'TRACKNUM'} || 0;

    # ARGH. Stupid API change demands stupid fix.
    for my $var qw( artist album track tnum ) {
        eval "if ( ref \$$var ) { \$$var = \$${var}->[-1]; \$$var =~ s/\\0\$//; };";
    }

    if ( !$track ) {
        $track = basename( $file );

        # attempt to strip out useless bits
        $track =~ s/\.mp3|\.m4a$//i; # extension

        if ( $tnum ) {
            print STDERR "discarding $tnum from $track\n";
            $track =~ s/\b$tnum\b(\s*-\s*)//;
        }

        if ( title_compare( $album, $track )) {
            $track = title_compare( $album, $track );
        }

        if ( title_compare( $artist, $track )) {
            $track = title_compare( $artist, $track );
        }
    }

    if ( !$artist ) {
        my ( $a, $t ) = split( /\s*-\s*/, $track );
        if ( $a and $t ) {
            $artist = $a;
            $track = $t;
        }
    }

	if ( defined( $tnum ) and $tnum !~ /^\d+$/ ) {
		$tnum =~ s/\/.*$//; # more muppets
	}

	if ( defined( $tnum ) and $tnum =~ /([^0-9])/ ) {
		print STDERR "Muppet alert, discarding $tnum.\n";
		$tnum = 0;
    }

	if ( $tnum == 0 ) {
	  if ( $track =~ /Track.*?(\d+)/ ) {
		$tnum = $1;
	  } elsif ( $track =~ /^(\d+[ -]+)/ ) {
		$tnum = $1;
	  } else {
	    print STDERR "uhoh, can't get a tnum\n";
      }
	}

    if ( defined( $track_override ) and $track_override ) {
        $tnum = $track_override;
    }

    my $mp3info = get_mp3info( $file );
    $length = $mp3info->{SECS}
      if $mp3info;
    $length ||= 0;

    if ( $debug ) {
        print "Extracted info:\n";
        print "Artist: $artist\n";
        print "Album: $album\n";
        print "Track: $track\n";
        print "Track number: $tnum Track Time: $length seconds\n";
    }

    my %guessed = (
                   "artist" => $artist,
                   "album" => $album,
                   "track" => $track,
                   "tnum" => $tnum,
                   "length" => $length,
                  );

    if ( defined( $id )) {
        my ( $g, $i ) = split( /[\/ ]/, $id );
        $g ||= "";
        $i ||= $id;

        # CDDB file or ID specified on the command line.
        my $disc_details;
        if ( open( XMCD, "<$id" )) {
            print "  Reading from file\n";
            binmode( XMCD );
            my @lines = <XMCD>;
            $disc_details = CDDB::parse_xmcd_file( \@lines );
        } else {
            print "  Reading from cache '$g' '$i'\n";
            $disc_details = getcddbfile_cache( $g, $i );
        }

        if ( !defined( $disc_details )) {
            die "erm. Can't figure out what to do with $id!\n"
        }

        # now we need to identify which track this is...
        my @bits; # genre, id, disc title, track title, duration, tracknum
        push @bits, "";         # we don't know the genre.
        push @bits, $disc_details->{discid};
        push @bits, $disc_details->{dtitle};

        $tnum = find_track_in_album( $disc_details, $file, \%guessed );

        if ( !defined( $tnum )) {
            print STDERR "Can't find this track on the album you specified...\n";
            if ( defined( $guessed{tnum})) {
                print STDERR "Going by existing track number, it should be:\n";
                print STDERR $disc_details->{ttitles}->[$guessed{tnum} - 1];
                print STDERR "\n";
            }
            if ( !$force ) {
                die "Stopped";
            } else {
                $tnum = $guessed{tnum};
            }
        }

        push @bits, $disc_details->{ttitles}->[$tnum - 1];
        push @bits, $disc_details->{seconds}->[$tnum - 1];
        push @bits, $tnum - 1;

        push @fullmatch, \@bits;
    } else {
        # try successively more desperate searches
        get_cddb_matches( $artist, $album, $track, $length );
        if ( !@fullmatch ) {
            get_cddb_matches( $artist, "", $track, $length );
            if ( !@fullmatch ) {
                get_cddb_matches( "", "", $track, $length );
                if ( !@fullmatch ) {
                    # at this point, it's likely that the trackname is
                    # hosed. try and build it from the filename.
                    my $strack = $track;
                    $track = $file;

                    # attempt to strip out useless bits
                    $track =~ s/\.mp3|\.m4a$//i; # extension

                    if ( $tnum ) {
                        $track =~ s/\b$tnum\b(\s*-\s*)//;
                    }

                    if ( title_compare( $album, $track )) {
                        $track = title_compare( $album, $track );
                    }

                    if ( title_compare( $artist, $track )) {
                        $track = title_compare( $artist, $track );
                    }

                    if ( $track ne $strack ) {
                        get_cddb_matches( "", "", $track, $length );
                    }
                }
            }
        }

        # prune fullmatch
        my %fullmatch;
        for my $match ( @fullmatch ) {
            my $dtitle = $match->[2];
            my $rdtitle = quotemeta( $dtitle );
            my @already = grep /^$rdtitle$/i, keys %fullmatch;
            if ( !@already ) {
                $fullmatch{$dtitle} = $match;
            }

            my $contender = $dtitle;
            for my $already ( @already ) {
                if ( $contender lt $already ) {
                    delete $fullmatch{$contender};
                    $fullmatch{$already} = $match;
                    $contender = $already;
                }
            }
        }

        @fullmatch = values %fullmatch;
    }

    if ( scalar ( @fullmatch ) == 1 ) {
        # dump out command line
        my $dtitle = $fullmatch[0]->[2];
        my $ttitle = $fullmatch[0]->[3];
        my ( $artist ) = split( '/', $dtitle );
        if ( $artist =~ /^various/i ) {
            ( $artist, $ttitle ) = split( '/', $ttitle, 2 );
        }
        ( undef, $dtitle ) = split( '/', $dtitle, 2 );

        # slop
        $dtitle =~ s/^\s+//;
        $dtitle =~ s/\s+$//;
        $artist =~ s/^\s+//;
        $artist =~ s/\s+$//;
        $ttitle =~ s/^\s+//;
        $ttitle =~ s/\s+$//;

        # string escapes
        $dtitle =~ s/"/\\"/g;
        $artist =~ s/"/\\"/g;
        $ttitle =~ s/"/\\"/g;
        $file   =~ s/"/\\"/g;

        # fixme: include genre
        print "CDDB ID : " . $fullmatch[0]->[1] . "\n";
        if ( abs( $fullmatch[0]->[4] - $length ) > 1 ) {
            print "length mismatch: $length should be " . $fullmatch[0]->[4] . "\n";
        }
        print "mp3name.pl --nodry-run";
        print " --album \"";
        print $dtitle;
        print "\" --artist \"";
        print $artist;
        print "\" --title \"";
        print $ttitle;
        print "\" --track ";
        print $fullmatch[0]->[5] + 1;
        print " \"$file\"";
        print "\n";
    } else {
        print scalar(@fullmatch) . " match(es) found\n";
        for my $details ( @fullmatch ) {
            print "  =================================\n";
            print "  | file details:\n  | " . $details->[2] . " / " . $details->[3] . "\n";
            print "  | track number: " . ($details->[5] + 1) . "\n";
            print "  | length: " . $details->[4] . "\n";
            print "  | discid file should contain " . $details->[1] . "\n";
            print "  =================================\n";
            print "\n";
        }
    }
}

# avoid stupid warning message.
while ( @mirrors ) {
    my $cddbp = shift @mirrors;
    $cddbp->disconnect();
}

# get a single CDDB file matching GENRE, ID and check it for TITLE, LENGTH
sub getcddbfile {
    my ( $genre, $id ) = split( ' ', $_[0] );
    my $title = $_[1];
    my $length = $_[2];
    my $artist = $_[3];

    # Do the lookup
    my $disc_details = getcddbfile_cache( $genre, $id );

    if ( defined( $disc_details )) {
        my @ttitles = @{$disc_details->{ttitles}};
        my $offset = 0;
        for my $ttitle ( 0 .. $#ttitles ) {
            $ttitles[$ttitle] ||= "blank";

            my $ltext = lc( $ttitles[$ttitle] );
            my $ltitle = lc( $title );

            printf( "%2d. %s %ds", ( $ttitle + 1 ), $ttitles[$ttitle],
                    $disc_details->{seconds}->[$ttitle] );

            $ltext = title_compare( $ltitle, $ltext );

            $ltext =~ s/^CD$//i; # acceptable lossage

            if ( $ltext ) {
                $ltext = title_compare( lc( $artist ), $ltext );
                if ( $ltext ) {
                    print "\n -> post-fuzzy track match: $ltext ";
                }
            }

            if ( !$ltext ) {
                # gronk allows one second of slop
                if ( $length and abs( $disc_details->{seconds}->[$ttitle] - $length ) > 1 ) {
                    printf( " -> MATCHED, BUT WRONG LENGTH! (%f vs %f)",
                            $disc_details->{seconds}->[$ttitle], $length );
                } else {
                    print " -> MATCH!";
                    push @fullmatch, [ $genre, $id, $disc_details->{dtitle}, $ttitles[$ttitle], $disc_details->{seconds}->[$ttitle], $ttitle ];
                }
            } elsif ( index( lc( $ttitles[$ttitle]), lc( $title ), 0 ) != -1 ) {
                print " -> PARTIAL MATCH!";
                push @fullmatch, [ $genre, $id, $disc_details->{dtitle}, $ttitles[$ttitle], $disc_details->{seconds}->[$ttitle], $ttitle ];
            } else {
                print " -> didn't match $title";
                print " ${length}s" if $length;
            }
            print "\n";
        }
    } else {
        print STDERR "Can't get $genre $id\n";
    }
}

# search for possible CDDB matches
sub get_cddb_matches {
    my ( $artist, $album, $track, $length ) = @_;

    my $searchstring = "$artist $album $track";

    my $save = $searchstring;
    $searchstring =~ s/\bthe\b//gi;
    if ( !$searchstring ) {
        $searchstring = $save;
    }
    $searchstring =~ s/[\(\)]//g;
    $searchstring =~ s/^\s+//;
    $searchstring =~ s/\s+$//;
    $searchstring =~ s/\s+/ /g;
    print "  -> Web search for $searchstring\n";
    $searchstring = uri_escape( $searchstring );
    my $fields = "";
    $fields .= "fields=artist" if $artist;
    $fields .= "&" if $fields;
    $fields .= "fields=title" if $album;
    $fields .= "&" if $fields and substr( $fields, -1 ) ne "&";
    $fields .= "fields=track" if $track;

    my $req = new HTTP::Request
      GET => $searchurl .
        "grouping=none&allcats=YES&allfields=NO&$fields&words=$searchstring";

    my $res = $ua->request( $req );

    my @match;
    if ( $res->is_success ) {
        my $content = $res->content;
        my $parser = new HTML::TokeParser( \$content );
        my $previous_dtitle = "";
        if ( $debug ) {
            open( DUMP, ">$ENV{HOME}/tmp/cdthingdebug.html" );
            print DUMP $content;
            close( DUMP );
        }
        while ( my $tag = $parser->get_tag( "a" )) {
            next unless $tag->[1]->{class};
            if ( $tag->[1]->{class} =~ m@searchResultTopLinkA@i ) {
                my $text = $parser->get_trimmed_text( "/a" );
                if ( $text =~ /^\d+$/ ) {
                    $text = $previous_dtitle;
                }
                $previous_dtitle = $text;
                print "Found>: $text";

                while ( $tag = $parser->get_tag( "b" )) {
                    if ( $parser->get_trimmed_text( "/b" ) eq "Disc-ID:" ) {
                        $id = $parser->get_trimmed_text( "/a" );
                        $id =~ s/\///;
                        $id =~ s/\s+/ /;
                        last;
                    }
                }
                print " ($id)";
                if ( lc($text) eq lc( "$artist / $album" )) {
                    print " -> match!\n";
                    push @match, $id;
                    getcddbfile( $id, $track, $length, $artist );
                } else {
                    # try fuzzy matching
                    my $ltext = lc( $text );
                    my $lartist = lc( $artist );
                    my $lalbum = lc( $album );

                    # high stupidity in action.
                    if ( $text =~ /^various(\s+artists)*\s*(\(.*?\))/i ) {
                        $ltext =~ s/$2//i;
                    }

                    if ( !$lartist ) {
                        $ltext =~ s@^.*/\s*@@;
                    } elsif ( !$lalbum ) {
                        $ltext =~ s@/\s*.*$@@;
                    }

                    if ( $lartist or $lalbum ) {
                        $ltext = title_compare( $lartist, $ltext );
                        $ltext = title_compare( $lalbum, $ltext );

                        $ltext =~ s/\bCD\b//i; # acceptable lossage
                        $ltext = title_compare( "various artists", $ltext );
                        $ltext = title_compare( "various", $ltext );
                    } else {
                        $ltext = "";
                    }

                    if ( $ltext ) {
                        if ( $text =~ /soundtrack/ ) {
                            $ltext = title_compare( "original", $ltext );
                            $ltext = title_compare( "motion picture", $ltext );
                        }
                    }

                    if ( $ltext ) {
                        if ( $ltext =~ /remixed by/i ) {
                            $ltext =~ s/\s*remixed by\s*//i;
                        }
                    }

                    if ( $ltext ) {
                        print "\n -> post-fuzzy title match: [$ltext]";
                    } else {
                        print "\n";
                        push @match, $id;
                        getcddbfile( $id, $track, $length, $artist );
                    }

                    print "\n";
                }
            }
        }
    } else {
        warn $res->code;
        next;
    }
}

# This is from Gronk's cddb.pl
# Given a piece of text (a song title or band name) converts it to something
# usable as a file name using the same algorithm as Grip: downcase, delete
# all non-alphanumerics, and map space to underscore.
#
sub string_to_file {
    local $_ = shift;           # protect $_

    # note: do not localize this: Grip doesn't.

    tr/A-Z/a-z/ if $force_lowercase; # downcase
    s@ @_@g if $force_underscore; # now map space to underscore
    s/[^$safechars]//gi;        # delete unsafe chars
    return $_;
}

# fuzzy match for movie titles, will use for album title comparison
sub title_compare {
    my @args = @_;
    map {
        $_ = uc( $_ );
        s/\(\d{4}\)//;          # year
        s/[^A-Z0-9]/ /g;
        s/\s+/ /g;
        s/^\s+//;
        s/\s+$//;
    } @args;

    my @twords1 = split( /\s/, $args[0] );
    my $t2 = $args[1];

    for my $word ( @twords1 ) {
        $t2 =~ s/\b$word\b//;
    }

    $t2 =~ s/\s+/ /g;
    $t2 =~ s/^\s+//;
    $t2 =~ s/\s+$//;

    # if there's anything left in t2 at this point, we've probably got
    # a mismatch.
    return $t2;
}

# caching CDDB files
sub getcddbfile_cache {
    my ( $gen, $id ) = @_;

    my $site = shift @mirrors;
    push @mirrors, $site;

    my @genres;

    # if genre is undefined, cycle over the lot of 'em.
    if ( !$gen ) {
        @genres = $site->get_genres();
    } else {
        push @genres, $gen;
    }

    my @results;
    for my $genre ( @genres ) {
        my $disc_details;

        if ( $#genres > 0 ) {
            print STDERR "  trying $genre/$id\n" if $debug;
        }
        $site = shift @mirrors;
        push @mirrors, $site;

        my @cache = (
                     "$ENV{HOME}/.cddb/$id",
                     "$ENV{HOME}/.cddb/$genre/$id",
                     "$ENV{HOME}/.cddb-web/$genre/$id",
                    );

        for my $cachefile ( @cache ) {
            if ( open( XMCD, "<$cachefile" )) {
                binmode( XMCD );
                my @lines = <XMCD>;
                my $lines = \@lines;
                $disc_details = CDDB::parse_xmcd_file( \@lines );
                close( XMCD );
                print "  unable to parse $cachefile!\n" unless $disc_details;
                last if $disc_details;
            } else {
                print "  unable to open $cachefile ($!)\n"
                  if -f $cachefile;
            }
        }

        if ( !defined( $disc_details )) {
            print "  fetching $id from " . $site->{host} . "...";
            $disc_details = $site->get_disc_details( $genre, $id );
            print "done\n";

            # now cache for future use
            if ( defined( $disc_details )) {
                print STDERR "   saving $genre/$id\n" if $debug;
                -d "$ENV{HOME}/.cddb-web" or
                  mkdir "$ENV{HOME}/.cddb-web", 0755;
                -d "$ENV{HOME}/.cddb-web/$genre" or
                  mkdir "$ENV{HOME}/.cddb-web/$genre", 0755;
                if ( open( XMCD, ">$ENV{HOME}/.cddb-web/$genre/$id" )) {
                    print XMCD $disc_details->{xmcd_record};
                    close( XMCD );
                }
            }
        }

        push @results, $disc_details if $disc_details;
    }

    if ( $#results > 0 ) {
        die "Multiple results found for $id\n";
    } else {
        return $results[0];
    }
}

# used when we know for certain that the track we want is in the
# specified album. returns the track number.
sub find_track_in_album {
    my $disc_details = shift;
    my $file = shift;
    my $guessed = shift;

    my $mp3info = get_mp3info( $file );
    my $length = $mp3info->{SECS};
    my $id3 = new MP3::ID3Lib( $file );

    for my $ttitle ( 0 .. $#{$disc_details->{ttitles}}) {
        my $ondisc_name = $disc_details->{ttitles}->[$ttitle];
        my $ondisc_len  = $disc_details->{seconds}->[$ttitle];

        # if there's an exact match, but the lengths don't match,
        # maybe there's another track with the same title? Really need
        # to keep a list of possible matches and then postprocess it. XXX
        if ( $ondisc_name eq $guessed->{track} ) {
            print "Name match (track " . ( $ttitle + 1 ) . ")...";
            if ( $ondisc_len != $length ) {
                print "length mismatch ($ondisc_len vs $length), throwing it back\n";
#                next;
            }
            print "\n";
        }

        my $ltext = title_compare( $guessed->{track}, $ondisc_name );

        if ( $ltext ) {
            if ( title_compare( $ltext, $ondisc_name )) {

                # cool, partial match. let's try tossing the artist.
                $ltext = title_compare( $guessed->{artist}, $ltext );

                if ( !$ltext ) {
                    return $ttitle + 1;
                }

                # stupid, but it happens
                if ( $ltext =~ /\bmix(ed)?\b/i and
                     $ondisc_name =~ /\bremix(ed)?\b/i ) {
                    return $ttitle + 1;
                }

                if ( $ltext =~ /\bremix(ed)?\b/i and
                     $ondisc_name =~ /\bmix(ed)?\b/i ) {
                    return $ttitle + 1;
                }

                # close enough for government work
                if ( abs( $ondisc_len - $guessed->{length}) <=1 ) {
                    return $ttitle + 1;
                }

                # substrings, woo. this is actually bad, and stupid,
                # and whatnot, but will do for now.
                my $glc = quotemeta( lc( $guessed->{track}));
                my $olc = lc( $ondisc_name );
                if ( $olc =~ /^$glc/ ) {
                    return $ttitle + 1;
                }
            }
        } else {
            # exact match
            return $ttitle + 1;
        }
    }

    return undef;
}
