#! /usr/bin/perl
#use utf8;
use XML::Parser::Lite;
use Socket;
use Encode qw(:all);
#use threads;
#use threads::shared;
$ttl=5; # minutes
$url='http://news.eu.by/';
$subject='Belarus';
$root=substr($0,0,rindex($0,'/'));
$web="$root/html";
$pub="$web/dir";
$temp="$root/tmp";
$xmlname="newz";
$deltaname="newz";
$htmlname="newz";
#my $enc='.gz';
$enc='';
$title='Breaking News!';
$rotate_time=5400; # sec;
#$proxy='195.50.2.154:8080';
@or=('Belarus','Belorussia','Byelorussia','Belarussian','Byelorussian','Bielorussia','Bielorusso','Bielorussa','Belarusse');
@or_ru=('%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D1%8C','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83cc%D0%B8%D1%8F','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%B8','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%B8%D0%B8','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B0%D1%8F','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%BE%D0%B5','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B5','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%BA%D0%B8%D0%B9','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%BA%D0%B0%D1%8F','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%BA%D0%BE%D0%B5','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%BA%D0%B8%D0%B5','%D0%91%D0%B5%D0%BB%D0%BE%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8%D1%85','%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%BA%D0%B8%D1%85&btnG=%D0%9F%D0%BE%D0%B8%D1%81%D0%BA','%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B5%D0%B9');
my $encoding='windows-1251';
my $arc="7z";
my $arc_month=-1;
my @month=('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');
$|=1;
alarm($ttl*60-30);
my %arcs=(
"tar.bz2"=>sub{exec("tar -cjf $_[0] --remove-files $_[1]")},
"tar.gz"=>sub{exec("tar -czf $_[0] --remove-files $_[1]")},
"rar"=>sub{exec("rar m -m5 -ds -md4096 -s -inull $_[0] $_[1]")},
"7z"=>sub{
(system("7z a -t7z -m0=lzma -mx=9 -mfb=64 -md=64m -ms=on -bd $_[0] $_[1]")==0)||return 0;
unlink <$_[1]> if($? == 0 &&-e $_[0])
}
);
&install if($ARGV[0] eq 'install');
do "$root/fin";
my $fail=0;
my $heads=qq(
$title
);
my $ad1=q(
);
my $ad2=q(
);
my %cont:shared=(
'root.eu.by'=>'Dzianis Kahanovich',
'www.bspu.unibel.by'=>'Belarussian State Pedagogical University',
'www.belta.by'=>'BELTA',
'news.google.com'=>'Google News',
'finance.google.com'=>'Google Finance',
'www.idealist.org'=>'Idealist.org'
);
my $or_='+OR+';
#my $or_='+%7C+';
my %all;
my %goo=(
''=>{q=>join($or_,qqq(@or))},
'de'=>{ned=>'tde',q=>join($or_,qqq('Wei%C3%9Frussland','Belarus','Belorussland','Wei%C3%9Fru%C3%9Fland','Beloru%C3%9Fland','wei%C3%9Frussisch','belarussich','wei%C3%9Fruthenisch','Belarusse','Wei%C3%9Frusse','Belarussin','Wei%C3%9Frussin','Wei%C3%9Frussische'))},
'nl_nl'=>{ned=>'tnl_nl',q=>join($or_,qqq('Wit-Rusland','Wit-Russisch','Wit-Rus','Wit-Russin','Wit-Russische','Bjelo-Rusland','Bjelorussisch','Bjelorussische','Bjelorus','Belarus'))},
'fr'=>{ned=>'tfr',q=>join($or_,qqq('Belarus','Bi%C3%A9lorussie','Bi%C3%A9lorusse','Bielaroussy','Bielarouss','Bi%C3%A9larussie','Bi%C3%A9larusse'))},
'es'=>{ned=>'tes',q=>join($or_,qqq('Bielorrusia','Belar%C3%BAs','%22Rusia+Blanca%22','bielorruso','bielorrusa','Belarus'))},
'pt-PT_pt'=>{ned=>'tpt-PT_pt',q=>join($or_,qqq('Bielo-R%C3%BAssia','Bielor%C3%BAssia','Bielorr%C3%BAssia','bielorrusso','bielorrussa','Belarus'))},
'ru'=>{ned=>'tru_ru',q=>join($or_,qqq(@or_ru))}
);
#my $ya=join('%7C',qqq('%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%EE%F0%F3%F1%F1%EA%E8%E9'));
my $ya=join('%7C',qqq(@or_ru));
my @lang=('ru','en','es','fr','de','nl_nl','it','pt-PT_pt','pt-BR_br');
my %lang_web=('en'=>2,'ru'=>1,'fr'=>2,'de'=>2,'nl_nl'=>2,'it'=>2,'pt-PT_pt'=>2,'pt-BR_br'=>2,'es'=>2);
my $results=0;
sub mon{
for(0..$#month){
if(index($_[0],$month[$_])>=0){
$_+=$_[1];
$_+=12 while($_<0);
return $month[$_]
}
}
}
sub arc_exit{
chdir($pub)||die $!;
while(<$xmlname.*.xml>){
my $m=mon($_);
my $time=gmtime;
for my $mm($arc_month..0){goto NN if($m eq mon($time,$mm))}
my $x=$_;
my $y="!";
$x=~s/(\d{4})/$y=$1/ex;
my $f="*$m*$y*";
my $fa="$xmlname.$y.$m.$arc";
if(!-e $fa){
&{$arcs{$arc}}($fa,$f);
exit
}
NN:
}
}
sub fn{
substr($_[1],-3) eq '.gz'?($_[0]?"|gzip -9 >$_[1]":"gzip -dc <$_[1]|"):($_[0]?">$_[1]":"<$_[1]");
}
sub qqq{
for(@_){$all{$_}=1}
@_
}
sub add_cont{
my $l=$_[0].'/';
my $x;
$l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei;
$cont{$x}||=$_[1];
}
sub cur_cont{
my ($i,$r);
for(sort keys %cont){
$i=unesc($_);
$r.="
".url("http://$i",$cont{$_}) if(index($_[0],$_)>=0 || index($_[0],$i)>=0 || $_ ne $cont{$_});
}
$r
}
sub esc{
my $x=shift;
$x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg;
$x;
}
sub unesc{
my $x=shift;
local $1;
$x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$x;
}
sub qm{
#quotemeta($_[0])
my $x=shift;
$x=~s/([\'\\])/\\$1/gs;
$x=~s/\r/\\r/gs;
$x=~s/\n/\\n/gs;
$x;
}
sub get_xml{
my ($s,%h,@a,@a1,@ad);
@a=split(/:\/\//,$_[0],2);
unshift @a,'http' if(!defined($a[1]));
@a[1,3]=split(/\//,$a[1],2);
@ad=@a[1,2]=split(/:/,$a[1],2);
@a1=@a;
$ad[1]||=80;
$a1[0]&&="$a[0]://";
$a1[2]&&=":$a[2]";
$a1[3]&&="/$a[3]";
if($proxy){
@ad=split(/:/,$proxy,2);
$a1[3]=join('',@a1);
}
print "+";
socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&&
setsockopt(SO,SOL_SOCKET,SO_SNDTIMEO,pack('LL',15,0))&&
setsockopt(SO,SOL_SOCKET,SO_RCVTIMEO,pack('LL',15,0))&&
connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))&&goto OK;
ERR:
close(SO);
$fail++;
return;
OK:
select(SO);$|=1;select(STDOUT);
print SO qq($_[1] $a1[3] HTTP/1.1
Host: $a1[1]
User-Agent: robot $url
Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
Accept-Language: ru,be;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: none
$_[2]Connection: close
)||goto ERR;
while((!eof(SO))&&defined(my $x=)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last}
$s=~s/(.*?): (.*?)[\r]\n/$h{lc($1)}=$2/gise;
undef $s;
while((!eof(SO))&&defined(my $x=)){$s.=$x}
close(SO);
my $s1;
$s=~s/.*(<\?xml.*?\/rss>).*/$s1=$1;''/se;
$s1,%h
}
sub url{
my $u=$_[0];
my $t=$_[1]||unesc($u);
"$t"
}
my (@block,%item,%channel,@items,$cnt0,$cnt0_);
## 'id'=>[start,char,end,start1,char1.end1];
my %blocks=(
'rss.channel'=>[
sub{%channel=()},
undef,
undef,
undef,
sub{shift;$channel{$block[$#block]}=join('',@_)}
],
'rss.channel.item'=>[
sub{%item=()},
undef,
sub{push @items,{%item};undef %item},
undef,
sub{shift;$item{$block[$#block]}=join('',@_)},
undef
],
'rss.channel.item.link'=>[
]
);
#$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'};
my %handlers=(
Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event},
Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event},
End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event},
);
sub parser_event{
my $e=shift;
my $id=join('.',@block[0..$#block-($e>3)]);
#print "$id\n";
if(exists($blocks{$id})){
my $h=$blocks{$id};
goto ref($h)||return;
HASH:return $h->{('Start','Char','End')[$e]}(@_);
ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef;
SCALAR:return;
}
}
sub get_rss{
my $url=shift;
my $head=shift;
my $retry=2;
RETRY:
my ($x,%h,$ff,$ffb,$t,$p,$lm);
print "get $url\n";
if(substr($url,0,7) eq 'file://'){
$ff=substr($url,7);
return if(!-e $ff);
goto FILE;
}
add_cont($url);
if($head){
($x,%h)=get_xml($url,'HEAD');
return if(!defined(%h));
}
if(exists($h{'last-modified'})){
$ffb="$temp/newz-".esc($url);
stat($ff="$ffb.".esc($h{'last-modified'}));
if(-e _){
FILE:
print "== $ff\n";
open(FF,fn(0,$ff)) or die "$! $ff";
$x='';
while(!eof(FF)&&(my $s=)){$x.=$s}
close(FF);
}else{
($x,%h)=get_xml($url,'GET');
return if(!defined($x));
}
}else{($x,%h)=get_xml($url,'GET');
}
$p=new XML::Parser::Lite;
$p->setHandlers(%handlers);
my ($e,$a);
$x=~s/(<\?xml.*?>)/$a.=$1;$1/es;
$a=~s/ encoding\=\"(.*?)\"/$e=lc($1)/es;
from_to($x,$e,$encoding,HTMLCREF) if($e && $e ne $encoding);
undef %channel;
@items=();
if(index($x,'/rss>')<0){
return if($h{'content-type'} eq 'text/html');
print "\nERROR: $h{'content-type'} $url\n";$retry--?goto RETRY:return
}
my $OLDDIE=$SIG{__DIE__};
$SIG{__DIE__}=sub{unlink($ff)};
$p->parse($x);
$SIG{__DIE__}=$OLDDIE;
## debug:
#$ff=substr(($ffb="$temp/newz-".esc($url)).".".esc($h{'last-modified'}||=gmtime),0,128);
if(exists($h{'last-modified'})){
while(my $d=<$ffb.*>){unlink($d)}
wrf($ff,$x) if($ff);
}
$x=$channel{title};
$x=~s/[\:\n].*//s;
add_cont($channel{link},quotemeta($x)) if($channel{link});
addnews(@_);
1
}
my %htm=(
'lt'=>'<',
'gt'=>'>',
'amp'=>'&',
'quot'=>'"'
);
sub dehtml{
my $s=shift;
$s=~s/\&(.*?)\;/$htm{$1}||"\&$1;"/gse;
$s
}
sub htmlz{
my $s=shift;
for(keys %htm){$s=~s/$htm{$_}/$_/gs}
}
my @news0:shared;
my %news:shared;
my %nh:shared;
my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);
sub nkey{$_[0]->{xlink}||$_[0]->{link}||$_[0]->{description}}
# [param_redirect[,lang[,no_sort_by_time]]]
sub addnews{
my ($l,$x);
my $u=$_[0];
for(@items){
$l=$_->{link};
$_->{lang}||=ref($_[1]) eq 'CODE'?&{$_[1]}($_):$_[1];
if($u){
$x="$l\&";
$x=~s/[\&\?\;]$u\=(.*?)\&/$l=dehtml($1);''/gse;
if($x ne "$l\&"){
$l="http://$l" if(index($l,'://')==-1);
$_->{xlink}=$l;
}
};
$l=nkey($_);
if(exists($nh{$l})){$nh{$l}++}
else{
if($_[2]){
$nh{$l}=0;
push @news0,$_;
}else{
$nh{$l}=1;
my ($t1,$t)=(0,$_->{pubDate});
$t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e;
$t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e;
$t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex;
$t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex;
$t1++ while(exists($news{$t1}));
$news{$t1}=$_
}
add_cont($l)
}
}
}
sub mv{rename($_[0],$_[1])||`mv -f $_[0] $_[1]`}
my $time=gmtime;
sub time2h{
my $x="$_[0] GMT";
$x=~s/ /\ \;/gs;
$x
}
my ($t0,$time0,$tstamp,$counter,$rotate);
if(open(FT,fn(0,"$pub/time$enc"))){
$t0=;chomp($t0);
$time0=;chomp($time0);
$counter=;chomp($counter);
$tstamp=;chomp($tstamp);
close FT;
print "time: ",time-$t0,"\n";
};
$tstamp||=time;
sub cp{
my $x=quotemeta($_[0]);
my $y=quotemeta($_[1]);
`cp -f $x $y`
}
if($ARGV[0] eq 'test'){$web=$pub=$temp;}
get_rss("file://$pub/$xmlname.xml$enc",0,undef,'ru',1);$cnt0_=$cnt0=$#news0;
if($ARGV[0] ne 'test' && (time-$t0>$rotate_time || $ARGV[0] eq 'rotate')){
cp("$pub/$xmlname.xml$enc","$pub/$xmlname.$time.xml$enc");
cp("$pub/$htmlname.html$enc","$pub/$htmlname.$time.html$enc");
undef $t0;
$rotate=1;
$cnt0=-1;
}
if(!defined($t0)){($t0,$time0,$counter)=(time,$time,0)}
my $hhead=qq(
');c=l=''}
));
wrf1("$web/newz.css",q(body{margin-left:1em;margin-right:1em;text-align:justify;text-indent:1em;margin-top:0px;margin-bottom:0px;}
table{margin-left:0em;margin-right:0em;text-indent:0em;margin-top:3px;margin-bottom:0px;}
li{text-align:justify;text-indent:0em;margin-top:3px;margin-bottom:3px;}
.highlightWhite td { background-color:#FFFFFF; }
.highlightGrey td { background-color:#EFEFEF; }
.highlightGreyRelated td { background-color:#EFEFEF; }
));
symlink("index-j.html","$web/index.html");
exit
}
__END__
License: Anarchy.
Все стихийные (включая социальные (включая юридические, моральные и т.д.))
аспекты существования и использования данного кода являются форс-мажорными
обстоятельствами и автора не интересуют.
Money are welcome.
(c) mahatma, 29.09.2006