|
看到有人用python写了一个,作为perl的用户,我们也不能落后。
把下面的代码存成baidu.pl,在同样的目录下面建一个mp3子目录。
chmod +x baidu.pl
./baidu.pl
这个程序会crawl baidu top500 mp3,然后自动下载mp3格式大于4M的音乐。Enjoy:2cool
注: 需要安装以下perl module
sudo apt-get install liburi-perl
sudo apt-get install libwww-perl
TIPs: 很短时间写的,暂时界面比较混乱,如果要监视下载进程可以
cd mp3
watch -d ls -lth
- #!/usr/bin/perl -w
- use strict;
- use URI;
- use LWP::UserAgent;
- use HTTP::Response;
- use URI::Heuristic;
- use File::stat;
- use Symbol;
- use POSIX;
- my $PREFORK = 5; # number of children to maintain
- my %children = ( ); # keys are current child process IDs
- my $children = 0; # current number of children
- sub REAPER {
- $SIG{CHLD} = \&REAPER;
- my $pid = wait;
- $children --;
- delete $children{$pid};
- }
- sub HUNTSMAN {
- local($SIG{CHLD}) = 'IGNORE';
- kill 'INT' => keys %children;
- exit;
- }
- # explore the initial link
- my $url = URI->new('http://list.mp3.baidu.com/topso/mp3topsong.html');
- $url->query_form(
- 'id' => '1#top2',
- );
- my $ua = LWP::UserAgent->new();
- my $response = $ua->get($url);
- my @content;
- if ($response->is_error()) {
- printf " %s\n", $response->status_line;
- exit;
- } else {
- @content = split("\n", $response->content());
- }
- my @raw_index = grep /http:\/\/mp3\.baidu\.com\/m\?tn=baidump3/i, @content;
- my @url_l1;
- my @song;
- for (my $i=0; $i<@raw_index; $i++) {
- if ($raw_index[$i] =~ /<a href="(.+)(\+(.+))+" target=_blank>(.+)<\/a>/) {
- push @url_l1, $1.$2;
- push @song, $4;
- }
- }
- my @url_rev = reverse @url_l1;
- my @song_rev = reverse @song;
- # Fork off our children.
- for (1 .. $PREFORK) {
- make_new_child( pop(@url_rev), pop(@song_rev) );
- }
- # Install signal handlers.
- $SIG{CHLD} = \&REAPER;
- $SIG{INT} = \&HUNTSMAN;
- # And maintain the population.
- while (1) {
- sleep;
- for (my $i=$children; $i<$PREFORK; $i++) {
- if (@url_l1 < 1) {
- # We are done
- exit;
- }
- make_new_child( pop(@url_rev), pop(@song_rev) );
- }
- }
- sub make_new_child {
- my ($url_proc, $song_name) = @_;
- my $pid;
- my $sigset;
- # block signal for fork
- $sigset = POSIX::SigSet->new(SIGINT);
- sigprocmask(SIG_BLOCK, $sigset)
- or die "Can't block SIGINT for fork: $!\n";
- die "fork: $!" unless defined ($pid = fork);
- if ($pid) {
- # Parent records the child's birth and returns.
- sigprocmask(SIG_UNBLOCK, $sigset)
- or die "Can't unblock SIGINT for fork: $!\n";
- $children{$pid} = 1;
- $children++;
- return;
- } else {
- # Child can *not* return from this subroutine.
- $SIG{INT} = 'DEFAULT'; # make SIGINT kill us as it did before
- # unblock signals
- sigprocmask(SIG_UNBLOCK, $sigset)
- or die "Can't unblock SIGINT for fork: $!\n";
- # explore the baby
- $response = $ua->get($url_proc);
- if ($response->is_error()) {
- printf " %s\n", $response->status_line;
- exit;
- } else {
- @content = split("\n", $response->content());
- }
- @raw_index = grep /<a target="_blank"/i, @content;
- my @size_index = grep /M<\/td>|<td>未知<\/td>/, @content;
- for (my $i=0; $i<@raw_index; $i++) {
- my @tmp = split (',', $raw_index[$i]);
- # filter file type other than .mp3
- next if ($tmp[2] !~ /(.+)\.mp3$/i);
- # filter file size less than 3 M
- next if ($size_index[$i] !~ /<td>(\d+)\.(\d+) M<\/td>/);
- if ($1 >= 5) {
- $raw_index[$i] =~ /href="(.+)" onclick=/i;
- $response = $ua->get($1);
- if ($response->is_error()) {
- #printf " %s\n", $response->status_line;
- next;
- } else {
- @content = split("\n", $response->content());
- }
- my @link_index = grep /"http:\/\/(.+)\.mp3"/i, @content;
- next if (@link_index < 1);
- next if ($link_index[0] !~ /href="http:\/\/(.+)\/(.+)\.mp3">/i);
- print "Downloading http:\/\/$1\/$2\.mp3 \t $song_name\n";
- sleep(rand(30));
- $song_name = time();
- my @args = ("wget", "-T 300", "http://$1/$2.mp3", "-O./mp3/$song_name.mp3");
- system(@args) == 0 or next;
- if (stat("./mp3/$song_name.mp3")->size < 4000000) {
- system("rm", "-f", "./mp3/$song_name.mp3");
- next;
- }
- # TODO maybe we can choose the largest file
- last;
- }
- }
- # tidy up gracefully and finish
- exit;
- }
- }
- exit;
复制代码 |
|