Subject: | fixed method for comments, added new method |
Hi,
sorry to "spam" the bug reports, but this seems the easiest way to contact you.
I've fixed the methods that retrieve the comments from a profile and I
also added a function that gets some basic info from the profile (age,
gender, country, etc). I attach the code in the file for you to look at.
If you want to incorporate it, I could probably check it into your CVS
(given a login) and then you can adapt it as I'm sure there is room for
improvement.
best, tobias
Subject: | code.txt |
#---------------------------------------------------------------------
# get_basic_info_on_page( $page );
# This routine takes the SOURCE CODE of the page and returns
# a hash of different information contained in the box on the
# top left corner
=head2 get_basic_info_on_page( $page );
This routine takes the SOURCE CODE of an HTML page and returns
a hash of information containing:
country - country in profile (names of countries are standardised on MySpace)
cityregion - the line with city and region information (this is free text)
headline - whatever it says next to the picture
age - as number
gender - as text, either male or female
lastlogin - date of last login
Note: MySpace joins the profile data from city and region to one line (such as Berlin, Germany).
However, both city and region are free text so people can write whatever they want. What is more,
region is optional. This function tries to extract the city and the region by splitting cityregion
at the last comma. However, it might not work (depending on the profile information) so both city
and region can be empty.
city - city
region - region
=cut
sub get_basic_info_on_page {
    # Extract the basic profile details (headline, gender, age, location,
    # last login) from the HTML source of a profile page.
    #
    # Parameters:
    #   $page - HTML source of the profile page, as a string.
    #
    # Returns (list context): a flat key/value list suitable for assigning
    # to a hash, with the keys headline, gender, age, cityregion, country,
    # lastlogin and, when cityregion contains ", ", city and region.
    # A placeholder key 'empty' (the filler lines of the box) is also
    # returned, as it always has been, so existing callers see the same hash.
    # Returns an empty list when the info box cannot be located.
    #
    # NOTE(review): when invoked as a method this relies on the enclosing
    # module having already removed the invocant from @_ -- confirm.
    my ( $page ) = @_;
    return unless defined $page;

    ##THIS IS LANGUAGE DEPENDENT SO SITE HAS TO BE ACCESSED IN ENGLISH!!!
    # Step 1: grab everything from the "Table2" element up to the end of
    # the "Last Login:" line. Capturing in list context yields undef on a
    # failed match instead of a stale $1 left over from an earlier match.
    # (The original author measured this match at around 6s on real pages.)
    my ( $box ) = $page =~ m{Table2".*?>(.*Last Login:.*?)<br>}sm;
    return unless defined $box;

    # Step 2: keep only the text that follows the left-aligned cell opening.
    my ( $fields ) = $box =~ m{align="left">(.*)}sm;
    return unless defined $fields;

    print $fields, "\n" if $DEBUG;

    # The box is a fixed sequence of <br>-separated lines; 'empty' marks
    # the filler lines. Work on copies so the trim never touches $_ aliases.
    my @field_names = qw(
        headline empty gender age cityregion country empty empty lastlogin
    );
    my @field_values = map {
        my $value = $_;
        $value =~ s/^\s+//;
        $value =~ s/\s+$//;
        $value;
    } split( '<br>', $fields );

    # Missing trailing fields become undef; surplus fields are discarded.
    my %info = ();
    @info{@field_names} = @field_values[ 0 .. $#field_names ];

    # Return age as number only.
    $info{'age'} =~ s/^(\d+).*/$1/ if defined $info{'age'};

    # Return last login as date only.
    $info{'lastlogin'} =~ s/Last Login:\s+([\d\/]*)/$1/
        if defined $info{'lastlogin'};

    # Guess city and region: the greedy first group makes the split
    # happen at the LAST ", " in the line.
    if ( defined $info{'cityregion'} ) {
        my ( $city, $region ) = $info{'cityregion'} =~ /(.*), (.*)/;
        if ( defined $city ) {
            $info{'city'}   = $city;
            $info{'region'} = $region;
        }
    }

    return %info;
}
#########################################
# fixed comments method
#######################################
sub tobiesch_get_comments {
    # Collect the comments left on the profile identified by $friend_id by
    # walking the paged comment listing.
    #
    # Returns a reference to an array of comment hashes as produced by
    # tobiesch_get_comments_from_page, or undef (with $self->error set)
    # when the total cannot be read, a page fails to parse, or the number
    # of collected comments differs from the total the site reports.
    #
    # NOTE(review): $self is not unpacked from @_ here; presumably the
    # enclosing module injects it (e.g. via a source filter) -- confirm.
    my ( $friend_id ) = @_;

    my @collected      = ();
    my $comments_url   = "http://comment.myspace.com/index.cfm?fuseaction=user.viewComments&friendID=" . $friend_id;
    my $next_target    = 'ctl00$Main$PagedComments$pagingNavigation1$NextLinkButton';
    my $response       = "";
    my $expected_total;

    $self->_die_unless_logged_in( 'get_comments' );

    print "Getting $comments_url\n" if $DEBUG;
    $response = $self->get_page( $comments_url );

    # Read the total from the "Listing a-b of N" banner.
    # Known limitations carried over from the original author:
    #  - totals above 1000 are not handled yet (see the page cap below);
    #  - the site sometimes reports more comments than actually exist.
    unless ( $response->decoded_content =~ /.*Listing [\d-]+ of (\d+).*/smo ) {
        $self->error("Could not find how many comments are on profile");
        return undef;
    }
    $expected_total = $1;

    # Visit at most 20 pages (the original author equates this with 1000
    # comments). The cap doubles as a safety net should the paging logic
    # break again.
    for my $page_number ( 1 .. 20 ) {
        $response = $self->{current_page};
        push @collected,
            $self->tobiesch_get_comments_from_page( $response->decoded_content );

        # Abort as soon as an error is reported.
        if ( $self->error ) {
            return undef;
        }

        # Stop when there is no further page to request.
        if ( !$self->_next_button( $response->decoded_content ) ) {
            last;
        }

        print "try to submit form to access comments page #", $page_number + 1, "\n"
            if $DEBUG;

        # Post the ASP.NET form back with the "next" link as event target.
        $self->submit_form({
            follow     => 0,
            form_name  => "aspnetForm",
            no_click   => 1,
            fields_ref => { __EVENTTARGET => $next_target, __EVENTARGUMENT => '' }
            #re1 => 'something unique.?about this[ \t\n]+page',
        });

        # Pause 1-2 seconds between page requests.
        sleep( int( rand( 2 ) ) + 1 );
    }

    # Anything other than the exact reported total counts as a failure.
    if ( scalar(@collected) != $expected_total ) {
        $self->error( "Could not collect all comments. Have " . @collected . ", should have $expected_total" );
        return undef;
    }

    return \@collected;
}
# Take a page, return a list of comment data
sub tobiesch_get_comments_from_page {
    # Parse every comment out of the HTML source of one comments page.
    #
    # Parameters:
    #   $page - HTML source of the comments page, as a string.
    #
    # Returns a list of hash references, one per comment, with the keys
    #   sender  - numeric id taken from the "UserID=" link parameter
    #   date    - text of the <h4> element that follows the sender link
    #   comment - text between that </h4> and the next </textarea>,
    #             with surrounding whitespace removed
    # Returns an empty list (and sets $self->error) when the page does not
    # contain the "Add Comment" link. A bare return is used so that a
    # caller pushing the result onto an array does not pick up a stray undef.
    #
    # NOTE(review): $self is not unpacked from @_ here; presumably the
    # enclosing module injects it (e.g. via a source filter) -- confirm.
    my ( $page ) = @_;
    my @comments = ();

    # Sanity check that this is a comments page. No /g here: a boolean
    # test must not move pos() on the string.
    if ( $page !~ m/Add Comment<\/a>/s ) {
        $self->error("Comment section not found on page");
        return;
    }

    # Scan forward with m//g instead of repeatedly cutting the front off
    # the page; each match resumes right after the previous </textarea>,
    # so the same comments are found without re-copying the page each time.
    while ( $page =~ m/UserID=([0-9]+).*?<h4>(.*?)<\/h4>\s*(.*?)\s*<\/textarea>/gs ) {
        push @comments, { sender => $1, date => $2, comment => $3 };
    }

    return @comments;
}
################################################