#!/usr/local/bin/perl
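#	Program that recursively fetches the files under a given URL
#	wwgetall
#		by Yoshioka Tsuneo(ke3057yt@ex.ecip.osaka-u.ac.jp)
#
#	You are free to copy, modify and redistribute this program.
#
#The following are required to run this program:
#	perl version 5 or later
#	wwget
#	client.pl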
#
# Work out the directory this script lives in; the wwget helper command is
# looked up relative to it.
$prog=__FILE__;
$progdir=$prog;
# When the script was started without any directory part (e.g. "perl wwgetall")
# there is no "/name" suffix to strip; fall back to the current directory
# instead of treating the script name itself as a directory.
$progdir="." unless $progdir=~s|/[^/]*$||;

# ---- Settings: adjust the following lines to suit your installation ----
#	Path of the wwget command.
$gethtml_cmd="$progdir/wwget";
#	Regular expression for the file extensions that may be fetched when the
#	-e option is not given. The empty string permits every file.
#$default_permited_ext='(\.htm|\.html)';
$default_permited_ext="";
#	Directory the fetched files are stored under when -d is not given.
$default_write_dir="$ENV{'HOME'}/html/";
#	Regular expressions for the extensions of HTML files and image files.
#	These are single-quoted on purpose: inside double quotes "\." collapses
#	to a bare ".", which as a regex matches any character.
$html_ext='(\.htm|\.html|\.HTML|\.HTM)';
$image_ext='(\.gif|\.jpeg)';

# External checksum command, and the switch that enables the duplicate-page
# check which uses it.
$md5_cmd = 'md5';
$MD5_CHECK = 1;

# ؿ
########################################
#Function name	:
#Purpose	:
#Arguments	:
#Returns	:
#Example	:


###############################
#Function name	:clean_path
#Purpose	:Tidy up a path by resolving its "." and ".." segments.
#Arguments	:$_[0]: path name
#Returns	:the cleaned-up path
#Example	: xxx/./yyy/../zz -->xxx/zz
#Notes		:- A ".." directly below the root is dropped ("/../x" --> "/x").
#		 - A ".." that has no preceding "/name" to cancel (for example a
#		   relative "../x") is left untouched.
#		 - A path that collapses to nothing becomes "/".
sub clean_path
{
	local($_)=@_;

	# Remove "." segments. One substitution per pass, repeated until nothing
	# changes, because neighbouring matches overlap ("a/././b").
	1 while s#/\.(/|$)#$1#;

	for(;;){
		# "/name/.." cancels out. "name" may contain dots ("v1.0") but must
		# not itself be "..", and the ".." must be a whole segment.
		if(s#/(?!\.\.(?:/|$))[^/]+/\.\.(?=/|$)##){
			if($_ eq ""){$_="/";}
			next;
		}
		# ".." at the root has no parent to climb to; just drop it.
		if(s#^/\.\.(?=/|$)##){
			if($_ eq ""){$_="/";}
			next;
		}
		# Nothing left that can be resolved: stop instead of spinning.
		last;
	}
	$_;
}

########################################
#Function name	:parse_path
#Purpose	:Split a file name (with its directory path) into its parts.
#Arguments	:$_[0]: file name including the directory path
#Returns	:(path, file, label)
#		 path  - directory part, always ending in "/"
#		 file  - file name; "index.html" when the name is empty
#		 label - "#..." fragment including the "#", undef when absent
#Example	:/xxx/yyy/zzz/index.html#label ->( /xxx/yyy/zzz/ , index.html,#label)
sub parse_path
{
	my($pathfile)=@_;
	my($path,$file,$label);
	$pathfile=&clean_path($pathfile);

	if($pathfile=~m|^(.*)/([^/]*)$|){
		$path=$1."/";$file=$2;
	}else{
		# No slash at all: everything is the file name. Without this branch
		# $1/$2 would still hold the captures of some earlier match.
		$path="/";$file=$pathfile;
	}
	# Strip the fragment before applying the default name, so that
	# "/dir/#top" yields index.html rather than an empty file name.
	if($file =~ s/^([^#]*)(#.*)/$1/){
		$label=$2;
	}
	if($file eq ""){$file="index.html";}
	return($path,$file,$label);
}

########################################
#Function name	:parse_url
#Purpose	:Split a URL into its parts, resolving relative URLs.
#Arguments	:$_[0]: URL (absolute, host-relative "/..." or relative)
#		 $_[1]: URL of the current document, used as the base when
#		        $_[0] is not an absolute http URL
#		 $_[2]: 1 on the internal call that parses the base URL; such
#		        a call returns the empty list if the base is not an
#		        http URL
#Returns	:(protocol, host, port, path, file, label)
#		 For mailto:/news: URLs only protocol and file are set.
#Example	:http://www.osaka-u.ac.jp/xxx/index.html
# 			 --> (http,www.osaka-u.ac.jp,80,/xxx/,index.html)
sub parse_url
{
	my($url,$current_url,$recursive)=@_;

	local($_)=$url;
	my($current_proto,$current_host,$current_port,$current_path,$current_file);
	my($proto,$host,$port,$path,$file,$label);

	if(m/^(mailto|news):(.*)$/){
		$proto = $1;
		$file = $2;
		return($proto,$host,$port,$path,$file,$label);
	}

	# URLs collected from HTML may carry stray blanks or line breaks; drop
	# them before any further test looks at the URL.
	s/\s//g;

	if(! m|^http://|i){
		if($recursive==1){return ();}
		# Not an absolute http URL: parse the current document's URL to get
		# the host, port and directory to resolve against.
		($current_proto,$current_host,$current_port,$current_path,$current_file)
			=&parse_url($current_url,'',1);
	}

	my($absolute_path);
	# The scheme must not contain ":", "/", "?" or "#"; a greedy \S+ here
	# would swallow everything up to a "://" embedded in a query string.
	# The path is optional so that "http://host" is read as "http://host/".
	if(m|^([^:/?#]+)://([^/:?#]+)(?::(\d+))?(/.*)?$|){
		my($m_proto,$m_host,$m_port,$m_path)=($1,$2,$3,$4);
		($proto=$m_proto) =~ y/A-Z/a-z/;
		$host=$m_host;
		$absolute_path=defined($m_path) ? $m_path : "/";
		if(defined($m_port)){
			$port=$m_port;
		}elsif($proto eq "http"){
			$port=80;
		}else{
			$port=((getservbyname($proto,"tcp"))[2]);
		}
	}elsif(m|^(/.*)$|){
		# Host-relative URL: same server, path given from the root.
		$proto="http";$host=$current_host;$port=$current_port;
		$absolute_path=$1;
	}else{
		# Relative URL: append it to the current document's directory.
		$proto="http";$host=$current_host;$port=$current_port;
		$absolute_path=$current_path . $_;
	}
	($path,$file,$label)=&parse_path($absolute_path) ;
	$host =~ y/A-Z/a-z/;
	return($proto,$host,$port,$path,$file,$label);
}

# Recursively create a directory
# Create the directory $_[0] together with every missing directory above it.
# Each leading part of the path is handed to mkdir in turn (mode 0777);
# failures, such as "already exists", are ignored.
# Returns the path that was passed in.
sub rmakedir
{
	my($remaining)=@_;
	my($made)="";

	# Peel off one leading component at a time and create the path so far.
	while($remaining =~ m|^(.[^/]*)(/.*)$|){
		$made.=$1;
		$remaining=$2;
		mkdir($made,0777);
	}

	# Whatever is left is the final component (or a trailing slash).
	$made.=$remaining;
	mkdir($made,0777);
	return $made;
}

# Assemble a URL from its parts.
# Arguments: (protocol, host, port, path, file, label), i.e. the list that
#            parse_url returns.
# Returns  : "protocol://host[:port]pathfilelabel". The port is left out when
#            it is 80, and also when it is empty or undefined, so that no
#            dangling "host:" is produced.
sub make_url
{
	my($proto,$host,$port,$path,$file,$label)=@_;
	my($port2);
	if(!defined($port) || $port eq "" || $port == 80){
		$port2="";
	}else{
		$port2=":$port";
	}
	return "$proto://$host$port2$path$file$label";
}


# get relative path FROM $current_url to $next_url
#	Returns the relative path that leads from $_[0] ($current_url) to
#	$_[1] ($next_url).
#	Both URLs are compared as "/host/path/file", so a link to another host
#	climbs above the current host's directory first. Ports are ignored.
sub get_relative_path
{
	my($current_url,$next_url)=@_;
	my($c_proto,$c_host,$c_port,$c_path,$c_file)=&parse_url($current_url);
	my($n_proto,$n_host,$n_port,$n_path,$n_file,$n_label)=&parse_url($next_url);
	my($c_path2)="/$c_host$c_path$c_file";
	my($n_path2)="/$n_host$n_path$n_file$n_label";
	my($r_path)="";

	# Reduce the current document to the directory that contains it.
	$c_path2=~s|/[^/]+$|/|;

	# Climb one directory at a time until it is a prefix of the target.
	while(substr($n_path2,0,length($c_path2))  ne $c_path2){
		# [^/]* rather than [^/]+ : an empty segment ("//") is a level too.
		# With "+" the substitution stops matching on such a path and the
		# loop would append "../" forever.
		last unless $c_path2 =~ s|[^/]*/$||;
		$r_path.="../";
	}
	$r_path.=substr($n_path2,length($c_path2));
	return $r_path;
}

# Fetch a (non-HTML) file from the server and save it unchanged.
# Arguments: $_[0]: URL of the file
# Uses the globals $write_dir, $gethtml_cmd, $time_limit and $size_limit.
# The file is written to "$write_dir$host$path$file"; the directory is
# expected to exist already. Dies if the wwget command cannot be started.
sub getfile
{
    my($url)=@_;
    # Parse the URL that was passed in rather than relying on a caller's
    # $current_url being visible through dynamic scoping.
    my($proto,$host,$dum,$path,$file)=&parse_url($url);
    my($out_name)="$write_dir$host$path$file";
    my($size)=0;
    my($chunk);

    my($out,$in);
    if(!open($out,">",$out_name)){
	warn "$!: can't write to $write_dir$host$path$file";
	# Nothing could be stored, so there is no point in downloading.
	return;
    }
    binmode($out);

    # List-form pipe open: the URL reaches wwget as a single argument and is
    # never interpreted by a shell. URLs come from fetched pages and may
    # contain shell metacharacters. $gethtml_cmd must therefore be the bare
    # path of the program, without extra arguments.
    open($in,"-|",$gethtml_cmd,"GET",$url,"-t$time_limit","-s$size_limit")
	||die "$!: can't pipe to $gethtml_cmd";
    binmode($in);

    # $size counts the reads of up to 1024 bytes each, i.e. roughly kilobytes.
    while(read($in,$chunk,1024)>0){
	$size+=1;
	print $out $chunk;
    }
    print "[getfile]got($url)($size kb)\n";
    close($in);
    close($out)||warn "$!: can't close $out_name";
}

# get URL under $current_url
#	Recursively fetch the file at URL $_[0] ($current_url) and the files it
#	links to.
# Arguments: $_[0]: URL to fetch
#            $_[1]: recursion depth (only used in progress messages)
# Uses the globals $permited_ext, $write_dir, $gethtml_cmd, $time_limit,
# $size_limit, $match_url, $image_ext, $MD5_CHECK, $md5_cmd and the hashes
# %dir_stack (files already handled) and %md5_set (checksum -> first file).
#
# An HTML page is copied to "$write_dir$host$path$file" with the href/src
# value of each <a>/<img> tag replaced by a relative path. Links that match
# $match_url, or that end in an image extension, are then fetched in turn.
# Non-HTML files are handed to getfile.
sub getallhtml
{
	# These stay local() (dynamically scoped) so that subroutines called
	# from here can still see $current_url by name.
	local($current_url,$level)=@_;
	local($count)=0;local(@url_list);


	print "[getallhtml]current_url=<$current_url>\n";
	local($proto,$host,$dum,$path,$file,$label)=&parse_url($current_url);

	# Skip files whose name does not end in a permitted extension.
	if($file !~ /${permited_ext}$/i){return;}

	# Skip files already handled in this run or already present on disk.
	if($dir_stack{"$host$path$file"}==1){return;}
	$dir_stack{"$host$path$file"}=1;
	if(-e "$write_dir$host$path$file"){return;}

	&rmakedir("$write_dir$host$path");

	# Anything that is not an HTML file is simply downloaded.
	if($file !~ /(\.html|\.htm|\.HTML|\.HTM)$/i){
		&getfile("$current_url");
	    return;
	}
	local($write_file_name) = "$write_dir$host$path$file";
	# Best effort: if the output file cannot be opened the page is still
	# read so that its links are followed.
	open(FH_W,">",$write_file_name)||warn "$!: can't open $write_dir$host$path$file";
	print "[getallhtml]($level)***GET***begin($current_url)->($write_dir$host$path$file)\n";


	{
		# $cmd is only built for the progress message. The command itself
		# is started with a list-form pipe open so that the URL, which
		# comes from fetched pages, is never interpreted by a shell.
		# $gethtml_cmd must therefore be the bare path of the program.
		local($cmd)="$gethtml_cmd GET $current_url -t$time_limit -s$size_limit";
		print "executing ($cmd)\n";
		open(FH_R,"-|",$gethtml_cmd,"GET",$current_url,"-t$time_limit","-s$size_limit")
			||die "$!: can't pipe to $gethtml_cmd";
	}
		local($line)=0;

	# (The former "$*=0" was dropped: $* is a fatal error on Perl 5.30 and
	# later, and 0, meaning single-line matching, is the default anyway.)
	while(<FH_R>){
		$line+=1;
		# A tag still open at the end of the line continues on the next
		# line: keep appending input until the "<" is closed or the input
		# ends.
		while(/<[^\>]*$/){
			$line+=1;
			local($newline);
			$newline = <FH_R>;
	                if($newline eq ""){last;}
			$_.=$newline;
		}

		# Handle every <a ... href=...> / <img ... src=...> tag in turn.
		while(/<\s*(a|img)([^>]+)(href|src)\s*=\s*\"?([^ \t">]+)"?([^>]*)>/i){
			local($pre,$post,$whole)=($`,$',$&);
			local($pre2,$post2)=($2,$5);
			local($tag,$option)=($1,$3);
			local($url)=$4;

			print "url=$url\n";
			local($proto,$host,$port,$path,$file,$label)=&parse_url($url,$current_url);
			if(($path =~ m|//|) || ($proto eq "mailto") || ($label ne "")){
				# Not something to mirror: copy the tag unchanged and go
				# on with the rest of the line. Leaving the loop here
				# would lose every later link on the same line.
				print FH_W $pre,$whole;
				$_=$post;
				next;
			}
			local($url2)=&make_url($proto,$host,$port,$path,$file,$label);

			# Queue the link if it matches the requested pattern or is an
			# image file (by extension).
			if( ($url2 =~ /^${match_url}/) || ($url2 =~ /${image_ext}$/i)){
				$url_list[$count]=$url2;
				print "[getallhtml]found URL($url2)\n";
				$count++;
			}
			# Rewrite the link as a path relative to the current page.
			$path=&get_relative_path($current_url,$url2);
			print FH_W $pre,"<${tag}${pre2}${option}=\"$path\"$post2>";

			$_=$post;
		}
		print FH_W $_;	
	}
	print "[getallhtml]($level)***GET***end($current_url)->($write_dir$host$path$file($line lines)\n";

	close(FH_W);close(FH_R);
	if($MD5_CHECK){
		# Do not follow the links of a page whose content was seen before.
		local($md5_ret)="";
		# List-form pipe open again: the file name is derived from a URL.
		if(open(FH_MD5,"-|",$md5_cmd,$write_file_name)){
			local($/);	# read the whole output at once
			$md5_ret=<FH_MD5>;
			close(FH_MD5);
		}
		# Accept "... = <hash>" at the end of the output (BSD md5 style) or
		# a leading "<hash> " (md5sum style). Anchoring matters: the file
		# name is echoed in the output and may itself contain "=".
		if($md5_ret =~ /=\s*([0-9a-fA-F]{32})\s*$/
				|| $md5_ret =~ /^([0-9a-fA-F]{32})\s/){
			$md5_ret = $1;
			print "md5 = [$md5_ret]\n";
			if($md5_set{$md5_ret}){
				print "Already Found the same file . original file = [$md5_set{$md5_ret}],file = [$write_file_name], md5=[$md5_ret]\n";
				return;	
			}
			$md5_set{$md5_ret} = $write_file_name;
		}else{
			print "Can't exec md5 command[$md5_cmd]\n";
		}
	}
	foreach(@url_list){
		&getallhtml($_,$level+1);
	}
}

# Display a brief usage message
# Print a one-line usage summary to STDERR.
# The currently selected output handle is left alone, and the program name
# is derived from $0 without running an external command.
sub usage
{
	my($prog)=$0;
	$prog=~s|^.*/||;	# keep only the part after the last "/"
	print STDERR "usage: $prog [-m<match_url>] [-d<write_dir>] [-t<time_limit>] [-s<size_limit>] [-e<permited_extension>] <http://hostname/path>\n";
}

# Main program: runs only when this file is executed directly.
# Parses the command line, fills in defaults and starts the recursive fetch.
if($0 eq __FILE__){
    $permited_ext=$default_permited_ext;

    # Process every argument. Testing the argument's truth instead would
    # stop at an argument such as "0".
    while(@ARGV){
	$_=shift(@ARGV);
	if(/^-m(.+)$/){
	    $match_url=$1;
	}elsif(/^-d(.+)$/){
	    $write_dir=$1;
	}elsif(/^-t(\d+)/){
	    $time_limit=$1;
	}elsif(/^-s(\d+)/){
	    $size_limit=$1;
	}elsif(/^-e(.*)$/){
	    $permited_ext=$1;
	}elsif(/^-pack$/){
	    # Maintainer helper: build the distribution archive, then exit.
	    print "packing wwgetall\n";
	    
	    system('rm -rf ${HOME}/public_html/soft/wwgetall');
	    system('mkdir ${HOME}/public_html/soft/wwgetall');
	    
	    system('cp ${HOME}/bin/client.pl ${HOME}/bin/wwget ${HOME}/bin/wwgetall ${HOME}/bin/bin_wwgetall ${HOME}/public_html/soft/wwgetall');
	    system('cd ${HOME}/public_html/soft/wwgetall;lha a wwgetall.lzh *');
	    exit 1;
	}else{
	    $start_url=$_;
	}     
    }

    if($start_url eq  ""){
	&usage();
	exit 1;
    }

    # Add "http://" only when no scheme is present. A bare /^http/ test
    # would wrongly treat a host name like "httpd.example.org" as complete.
    if($start_url !~ m|^[a-z][a-z0-9+.\-]*://|i){
	$start_url="http://$start_url";
    }

    # "http://host" -> "http://host/"
    if($start_url =~ m|^http://[^/]*$|i){
	$start_url.="/";
    }

    print "start url is<$start_url>\n";


    if($start_url !~ m|^http\://([^/:]+)(:(\d+))?(/.*)$|i){
	&usage();
	exit 1;
    }
    if($match_url eq ""){
	# No -m given: follow everything below the start URL's directory.
	$match_url=$start_url;
	if($match_url =~ m/${html_ext}$/){
        	$match_url =~ s|[^/]*$||;
	}
	$match_url =~ s|:80/|/|;
	# $match_url is used as a regular expression; a pattern derived from
	# a literal URL must have its metacharacters (".", "?", "+") escaped.
	$match_url=quotemeta($match_url);
    }else{
	# A pattern given with -m is used as written, apart from dropping
	# the default port.
	$match_url =~ s|:80/|/|;
    }
    if($write_dir eq ""){
	$write_dir=$default_write_dir;
    }
    if($write_dir !~ m|/$|){
	$write_dir.="/";
    }
    # The limits are passed to wwget as -t / -s; default to a very large
    # value for both.
    if($time_limit eq ""){
	$time_limit=99999;
    }
    if($size_limit eq ""){
	$size_limit=99999;
    }
    mkdir($write_dir,0777);
    print "write_dir=<$write_dir>\n";
    print "match_url=<$match_url>\n";
    print "start_url=<$start_url>\n";

    &getallhtml($start_url,0);
}










