#!/usr/bin/perl
# Archana Sharma-Oates 2008
#
# This Perl script extracts the text and links from a specified HTML page.
# It then splits publication references into title, year, authors' names
# and journal reference. These are written to a text file in the Endnote
# format.
#
# Outputs:
#   mark_out.txt  - the fetched page with every HTML tag removed
#   mark_out2.txt - one tab-separated record per reference line
#                   (<TAB>year<TAB>title<TAB>journal fields, one per line)
#   STDOUT        - the same record, preceded by the parsed author list

use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use HTML::LinkExtor;
use HTML::Parser;
use Data::Dumper;

my $outfiles  = "mark_out.txt";
my $outfiles2 = "mark_out2.txt";

# Number of leading lines of the stripped page that are skipped before
# reference parsing starts.
my $skip_lines = 76;

my $ua = LWP::UserAgent->new;
$ua->timeout(120);
$ua->env_proxy;
$ua->proxy(['http', 'ftp', 'wais'], 'http://proxy.leeds.ac.uk:3128');

# Specify your url here
my $url      = 'http://dis.shef.ac.uk/mark/publications/';
my $request  = HTTP::Request->new('GET', $url);
my $response = $ua->request($request);

# Stop here on a failed fetch instead of parsing an error page as references.
die "Can't fetch $url: " . $response->status_line . "\n"
    unless $response->is_success;

# Extract content of html page
my $content = $response->content();

# Extract html links. This has to happen while the markup is still intact:
# once the tags are stripped there is nothing left for LinkExtor to find.
my $parser = HTML::LinkExtor->new();    # LinkExtor object with no callbacks
$parser->parse($content);
$parser->eof;
my @links = $parser->links;             # list of links
#print Dumper \@links;                  # print list of links out.

# Remove every HTML tag, leaving only the text of the page.
$content =~ s/<.+?>//g;

# Write the text extracted from the html page to a file
open(my $text_out, '>', $outfiles) or die "Can't open $outfiles: $!\n";
print {$text_out} $content;
close($text_out) or die "Can't close $outfiles: $!\n";

open(my $ref_out, '>', $outfiles2) or die "Can't open $outfiles2: $!\n";
open(my $text_in, '<', $outfiles)  or die "Can't open $outfiles: $!\n";

# Read the file containing the content of the html page, one reference per line
while (my $line = <$text_in>) {
    chomp $line;
    next unless $. > $skip_lines;       # skip the leading non-reference lines

    my $reference = parse_reference($line);
    my @authors   = @{ $reference->{authors} };
    my $year      = $reference->{year};
    my $title     = $reference->{title};

    # NOTE(review): authors are only echoed to STDOUT; the original never
    # wrote them to $outfiles2 (that print was commented out) - confirm
    # whether the output file should carry them as well.
    print "@authors" if @authors;

    print {$ref_out} "\t$year\t";
    print {$ref_out} "$title\t";
    print "\t$year\t$title\t";

    for my $journal (@{ $reference->{journals} }) {
        print {$ref_out} "$journal\n";
        print "$journal\n";
    }
    print {$ref_out} "\n";
}

close($text_in);
close($ref_out) or die "Can't close $outfiles2: $!\n";

# Split one comma-separated reference line into its parts.
#
# Parameter: the reference line, without its trailing newline.
# Returns a hash reference with the keys:
#   year     - first run of four digits found in the line, '' if there is none
#   title    - first comma-separated field with trailing whitespace removed
#   authors  - array ref of "surname,initials//" strings; the trailing "//"
#              is removed from the last entry
#   journals - array ref of the remaining fields that contain a capital
#              letter or a digit
#
# TODO: a comma inside the title splits the title across several fields.
sub parse_reference {
    my ($line) = @_;

    my @words = split /,/, $line;

    my $year  = $line =~ m/(\d{4})/ ? $1 : '';
    my $title = @words ? $words[0] : '';
    $title =~ s/\s+$//;

    my (@authors, @journals);
    for my $field (@words[1 .. $#words]) {
        if ($field =~ /[A-Z]\s*\.\s*[A-Z]\s*/) {
            # Looks like initials ("A. B."): everything before the last
            # whitespace-separated run of two or more letters/hyphens is
            # taken as the initials, that run as the surname. Both stay
            # empty when the field cannot be split that way.
            my ($initials, $surname) =
                $field =~ /^(.*)\s+([A-Za-z\-]{2,})/ ? ($1, $2) : ('', '');
            push @authors, $surname . ',' . $initials . '//';
        }
        elsif ($field =~ /[A-Z]\s*|[0-9]\s*/) {
            push @journals, $field;
        }
    }

    # Remove last // from list of names
    $authors[-1] =~ s!//$!! if @authors;

    return {
        year     => $year,
        title    => $title,
        authors  => \@authors,
        journals => \@journals,
    };
}