scripts/gettd.pl
changeset 29 13a99a13ca90
parent 28 ab800b3e45ab
child 30 9372f863bb3a
equal deleted inserted replaced
28:ab800b3e45ab 29:13a99a13ca90
     1 #!/usr/bin/perl
       
     2 
       
     3  
       
     4 
       
     5 use IO::Socket; 
       
     6 use Getopt::Long;
       
     7 
       
     8 
       
     9 my $target_url; #target url for the roadmap
       
    10 my $tdomain; #tag for the domain to be use in csv file
       
    11 my $csvfile; #output csv file name
       
    12 my $authon= '';	 #does it require authorisation? default is false
       
    13 
       
    14 
       
    15 my $count_target; #this value is used to pass a string to match and count on each package backlog
       
    16 my $ispackage;
       
    17 my $splitbklogs;
       
    18 my $summaryheader="ID\tPackage\tFeatures\tFormat\tHttp\n" ;
       
    19 my $newtdformat = 0;
       
    20 my @blist = ("Tracking_Package_features","Kernel_EPLization_Backlog",
       
    21     "Package_Backlog_Q","Wishlist","Test_Package","Backlog_Details");
       
    22 
       
    23 
       
    24 sub blacklist
       
    25 {
       
    26  ($name)=@_;
       
    27       
       
    28  foreach(@blist) {
       
    29  
       
    30  if ( $name =~ m/$_/sg) { print "WARNING - $name is blacklisted\n"; return 1;}
       
    31  }
       
    32  
       
    33  return 0;
       
    34 
       
    35 }
       
    36 
       
    37 
       
    38 sub getpage
       
    39 {
       
    40 	#arguments
       
    41 	($page,$host,$auth,$myfile)=@_;
       
    42 	
       
    43 	
       
    44 	#output file
       
    45 	open ( outputfile, ">".$myfile);
       
    46 	
       
    47 	
       
    48 	$port = "http(80)";
       
    49 	$getmess = "GET " . $page ." HTTP/1.1\n" . $auth;
       
    50 
       
    51 	print "INFO - sending message - $getmess\n";
       
    52 	print outputfile "$getmess\n\n";
       
    53 
       
    54 	$sock = IO::Socket::INET->new 	
       
    55 		(
       
    56 		 PeerAddr => $host,   PeerPort => $port,  Proto => 'tcp', 
       
    57 		) ;
       
    58 
       
    59  
       
    60 	print $sock "$getmess\n\n";
       
    61 
       
    62  
       
    63 	while(<$sock>) {
       
    64  
       
    65 	  print outputfile $_;
       
    66  
       
    67 	}	
       
    68   	
       
    69 	close ($sock);
       
    70 	close (outputfile);
       
    71 }
       
    72 
       
    73 sub prntfeatures 
       
    74 {
       
    75 
       
    76 	($release,$package,$features,$myfile,$domain)=@_;
       
    77 	
       
    78   $release =~ s/\\//sg;	
       
    79 	$csvdel ="\",\"";
       
    80 	$invcoma ="\"";
       
    81  if ($newtdformat) {
       
    82   $package =~ s/backlog//sgi;
       
    83   print $myfile "$invcoma $release $csvdel $domain $csvdel $package $csvdel $myfeat $invcoma\n";
       
    84   
       
    85  } else {
       
    86 		
       
    87 	$features = $features."<dt";
       
    88 
       
    89 	
       
    90 
       
    91 	while ( $features =~ /dt\>(.*?)\<\/dt(.*?)\<dt/sg  ){
       
    92 		$myfeat = $1;
       
    93 		$subfeat =$2;
       
    94 		
       
    95 		$myfeat =~ s/\n/ /sg;
       
    96 		
       
    97 		pos($features) = pos($features) -2;
       
    98 		
       
    99 		$mystr="";
       
   100 		while ( $subfeat =~ /\<dd\>(.*?)\<\/dd\>/sg) {
       
   101 			$mysubfeat = $mysubfeat.$mystr.$1;
       
   102 			$mystr = " & ";
       
   103 		}
       
   104 		undef $mystr;
       
   105 	$mysubfeat =~ s/,/ /sg;
       
   106 	$mysubfeat =~ s/\n//sg;
       
   107 	$mysubfeat =~ s/\<.*?\>//sg;
       
   108 	
       
   109 
       
   110 	print $myfile "$invcoma $release $csvdel $domain $csvdel $package $csvdel $myfeat $csvdel $csvdel $csvdel $mysubfeat $invcoma\n";
       
   111 	
       
   112 	$mysubfeat = "";	
       
   113 	}
       
   114 		
       
   115  }
       
   116 }
       
   117 	
       
   118 sub loadfile
       
   119 {
       
   120 
       
   121 	$/ = " ";
       
   122 	#arguments
       
   123 	($myfile)=@_;
       
   124 	open ( inputfile, "<".$myfile);
       
   125 	my $contents = do { local $/;  <inputfile> };
       
   126 	close(inputfile);
       
   127 	return $contents;
       
   128 
       
   129 }
       
   130 
       
   131 sub td_roadmap
       
   132 {
       
   133 
       
   134 
       
   135 	#arguments
       
   136 	($infile,$outfile,$domain,@releases)=@_;
       
   137 
       
   138 	
       
   139 	$roadmap=loadfile $infile;
       
   140 	open ( outputfile, ">>".$outfile);
       
   141 
       
   142 
       
   143   if ($newtdformat) {
       
   144        print "Processing new TD roadmap format\n";
       
   145          if ($roadmap =~ m /Contents\<\/h2\>.*?\<\/table/sg) { $roadmap =$';}
       
   146          foreach (@releases) {
       
   147           $exp=$_." Roadmap";
       
   148 		         
       
   149            if ($roadmap =~ m /($exp)/sg) { 
       
   150 			     print "PASS - Found entry for $_ \n";
       
   151 			     $relroad =$';	
       
   152 			
       
   153 			     if ($roadmap =~ m /table\>(.*?)\<\/table/sg) { $relroad =$1;}
       
   154 			      $relroad =~ s/\n//sg;
       
   155 			      
       
   156             $skipfirst =1; #skipping the header of the table         
       
   157             while ($relroad =~ m/\<tr\>(.*?)\<\/tr(.*)/sg){
       
   158                   $myfeat=$1;
       
   159                   $relroad =$2;
       
   160                   if ($skipfirst) {$skipfirst=0;next;}
       
   161                   $package="";
       
   162                   if  ($myfeat =~ m/title\=\"(.*?)\"/sg) {  $package=$1; } #looking for package name
       
   163                                   
       
   164                   $myfeat=~ s/\<\/td\>\<td\>/\",\"/sg;   
       
   165                   $myfeat=~ s/\<.*?\>//sg;
       
   166                  
       
   167                  if ($myfeat =~m/[A-z]/sg ){prntfeatures($_,$package,$myfeat,outputfile,$domain);}
       
   168                 
       
   169                  }  		     
       
   170          }
       
   171         }
       
   172   } else {
       
   173 
       
   174 	 foreach (@releases) {
       
   175 		
       
   176 	 	$exp="\\<h2\\>.*?\\>".$_;
       
   177 		  
       
   178 		if ($roadmap =~ m /($exp)/sg) { 
       
   179 			print "PASS - Found entry for $_ \n";
       
   180 			$relroad =$';	
       
   181 			
       
   182 			if ($relroad =~ m /(.*?)\<h2/sg) { $relroad =$1;}
       
   183 			$i=0;	
       
   184 			while ($relroad=~ m/\<h3\>.*\>(.*?)\<.*<\/h3/g) {
       
   185 				$package = $1;		
       
   186 				$ppos[$i]= pos($relroad);
       
   187 				$pname[$i]= $package;
       
   188 				$i++;
       
   189 			}
       
   190 			for ( $i=0;$i<($#ppos); $i++){
       
   191 				$features= substr ($relroad, $ppos[$i],$ppos[$i+1]-$ppos[$i]);
       
   192 				prntfeatures($_,$pname[$i],$features,outputfile,$domain);
       
   193 			}
       
   194 			$features= substr ($relroad, $ppos[$i]);
       
   195 		
       
   196 			prntfeatures($_,$pname[$i],$features,outputfile,$domain);
       
   197 			@ppos ="";
       
   198 			@pname ="";
       
   199 			undef ($features);
       
   200 		}
       
   201 	}		 	
       
   202 
       
   203 	}
       
   204 	
       
   205 	
       
   206 
       
   207 	close (outputfile);
       
   208 
       
   209 
       
   210 }
       
   211 
       
   212 
       
   213 sub parse_category {
       
   214 
       
   215 	#arguments
       
   216 	($infile)=@_;
       
   217 
       
   218 	my @mylink;
       
   219 
       
   220 	$mypage=loadfile $infile;
       
   221 	$i=0;	
       
   222 	if ( $mypage =~ m/Pages in category(.*)\<\/table/sg) {
       
   223 		print "INFO - Category page found\n";
       
   224 		$mypage = $1;
       
   225 		
       
   226 		while ($mypage =~ m /\<a href\=\"(\/wiki\/index\.php\/.*?)\"/g) {
       
   227 			
       
   228 			$mylink[$i] = $1;	
       
   229 			$i++;
       
   230 			
       
   231 		}
       
   232 	print "INFO - Found $i items in the category page\n"
       
   233 	}
       
   234 	return @mylink;
       
   235 }
       
   236 
       
   237 sub bklog_domain {
       
   238  #argument
       
   239 ($mytechdomian)=@_;
       
   240 
       
   241   $mytechdomian =~s/\)//sg;
       
   242   $mytechdomian =~s/\(//sg;
       
   243   
       
   244  $domaininfo=loadfile ("package_domains.csv");
       
   245  if ($domaininfo =~ m/$mytechdomian.*?\{(.*?)\}/sgi ) {
       
   246       return $1;
       
   247  }
       
   248   print "ERROR - domain not found for $mytechdomian\n";
       
   249   return "orphan";
       
   250  
       
   251 }
       
   252 
       
   253 sub parse_bklog {
       
   254 	
       
   255 	#arguments
       
   256 	($infile,$outfile,$id)=@_;
       
   257 	$mypkg=loadfile $infile;
       
   258 	#list if the bklog has been ported to the new bugzilla based format
       
   259   $headerformat= "wiki_format";
       
   260   
       
   261 	
       
   262 	open ( outputfile, ">>".$outfile);
       
   263 	open ( soutputfile, ">>"."summary_".$outfile);
       
   264 	
       
   265 	if ($mypkg =~ m/index\.php\/(.*?) HTTP/sg) {
       
   266   
       
   267 		$pagename = $1;
       
   268 		
       
   269     if ($splitbklogs) {
       
   270       $whichtd =  bklog_domain($pagename);
       
   271       open ( tdoutputfile, ">>".$whichtd. "_".$outfile);
       
   272      }
       
   273     
       
   274     
       
   275     
       
   276     if (blacklist($pagename)) {
       
   277        
       
   278     	  close (outputfile);
       
   279 	      close (soutputfile);
       
   280 	    
       
   281 	      if ($splitbklogs) { close (tdoutputfile);}
       
   282         return 0;
       
   283     }
       
   284 		print "INFO -Processing Package $pagename \n";
       
   285 		$i=0;
       
   286 		$found_counter =0;
       
   287 		
       
   288 		if ($mypkg =~m/class\=\"bugzilla sortable\"/sg ) { $headerformat="autobug_format"; }
       
   289 		
       
   290 		while ($mypkg =~ m/\<tr.*?\>(.*?)\<\/tr/sg) { 
       
   291 			$myheader= $&;
       
   292       if ($myheader =~ m/style=\"background-color\:/sg) {
       
   293         next;
       
   294       }
       
   295 			$myfeat= $1;
       
   296 			$myfeat =~ s/\<\/td\>/\t/sg;
       
   297 			$myfeat =~ s/\<.*?\>//sg;
       
   298 			$myfeat =~ s/\n//sg;
       
   299 			
       
   300 			if ($myfeat =~ m/IDPStatus/sg) { #header for bugzilla mediawiki plugin
       
   301         next;
       
   302       }
       
   303 			
       
   304 			if ($myfeat =~ m/[A-z]/sg and not $myfeat =~ m/\&lt\;etc/sg and 
       
   305 			not $myfeat =~ m/\&lt\;Feature/sg and not $myfeat =~ m/Item not available/sg) {
       
   306 				print outputfile "$pagename\t$myfeat\n";
       
   307 				if ($splitbklogs) { print tdoutputfile "$pagename\t$myfeat\n";}
       
   308         
       
   309       #  print "matching $myfeat with $count_target\n"    ;
       
   310         if ($myfeat =~ m/$count_target/sg) {$found_counter++;}
       
   311         $i++;
       
   312 			}
       
   313 			
       
   314 		}
       
   315   
       
   316   if ($count_target){
       
   317     $mycount=$i."\t".$found_counter;
       
   318   } else {
       
   319     $mycount=$i;
       
   320   }
       
   321 	
       
   322 	if ($splitbklogs) { 
       
   323     print soutputfile "$id\t$pagename\t$mycount\t$headerformat\t$whichtd\thttp://developer.symbian.org/wiki/index.php/$pagename\n";
       
   324 	} else {
       
   325     print soutputfile "$id\t$pagename\t$mycount\t$headerformat\thttp://developer.symbian.org/wiki/index.php/$pagename\n";
       
   326   }
       
   327 
       
   328 	}
       
   329 
       
   330 	close (outputfile);
       
   331 	close (soutputfile);
       
   332   if ($splitbklogs) { close (tdoutputfile);}
       
   333                            
       
   334 }
       
   335 
       
   336 
       
   337 
       
   338 
       
   339 #help print
       
   340 sub printhelp
       
   341 {
       
   342        
       
   343 	print "\n\n version 1.1
       
   344 	\ngettd.pl -t=url -d=domain \n\nRequired parameters for Technology Roadmaps:\n\t -t url containing the technology domain roadmap\n\t -d the technology domain name
       
   345 	\n\nOptional Parmeters for Technology Roadmaps\n\t-new if the roadmap has the new wiki format
       
   346   \n\nRequired Parameters for Package backlogs\n\t-p for package backlog analysis. just run gettd.pl -p
       
   347   \n\nOptional Pararmeters for Package backlogs\n\t -compare [f1] [f2] compares two package summary files for changes ignores order
       
   348   \n\t -split splits the content of the backlog output into technology domains. requires package_domains.csv file with mapping details
       
   349   \n\t -count=regexp counts the times that a package backlog line matches the regexp, the results are output to the summary file
       
   350   \n\nCommonOptional parameters\n\t-o filename ,the output is logged into the output.csv file by default\n\t-h for help
       
   351 	\n\t recommend to run under cygwin environment and perl version v5.10.0 \n
       
   352   \n\t pages blacklisted for package backlogs are @blist\n";
       
   353 	exit;
       
   354 }
       
   355 
       
   356 
       
   357 
       
   358 #compare bklogs
       
   359 sub compare_bklogs {
       
   360 	#arguments
       
   361 	(@bklogs)=@_;
       
   362 	
       
   363 	if (not $#bklogs == 1) { printhelp;}
       
   364 
       
   365 	
       
   366 	$cmd ="cut -f 2,3 ". $bklogs[0] . " | sort -u > tmp1.txt";
       
   367 	
       
   368 	system($cmd);
       
   369 	
       
   370 	$cmd ="cut -f 2,3 ". $bklogs[1] . " | sort -u > tmp2.txt";
       
   371 	system($cmd);
       
   372 	
       
   373 	exec ("diff tmp1.txt tmp2.txt | grep '[<|>]'");
       
   374 	system("rm temp*.txt");
       
   375 	
       
   376 	exit;
       
   377 
       
   378 }
       
   379 
       
   380 
       
   381 
       
   382 
       
   383 #process command line options
       
   384 sub cmd_options
       
   385 {
       
   386 
       
   387   my $help;
       
   388   my @compare;
       
   389 
       
   390 
       
   391   GetOptions('h' => \$help,'t=s'=> \$target_url, 'd=s' => \$tdomain , 'o=s' => \$csvfile, 
       
   392 	'a' => \$authon , 'p' => \$ispackage, 'compare=s{2}' =>\@compare, 'new' => \$isnewformat,
       
   393    'split' => \$splitbklogs, 'count=s' => \$count_target);
       
   394 
       
   395   if (@compare) {
       
   396 	compare_bklogs @compare;
       
   397 	
       
   398   }
       
   399   if ($count_target) {
       
   400     print "INFO - Seaching for $count_target\n";
       
   401   }
       
   402 
       
   403   if ($help) {
       
   404     printhelp;
       
   405   }
       
   406 
       
   407 
       
   408  if ($ispackage) {
       
   409 
       
   410  	$tdomain =" ";
       
   411 	$target_url = "http://developer.symbian.org/wiki/index.php/Category:Package_Backlog";
       
   412 	
       
   413  }  
       
   414  if ($isnewformat){
       
   415     $newtdformat = 1;
       
   416  
       
   417  }
       
   418 
       
   419  if ( not $target_url) {
       
   420 
       
   421 	print "ERROR-missing arguments target url\n";
       
   422 	printhelp;	
       
   423   } 
       
   424 
       
   425 
       
   426  if (not $tdomain){
       
   427 	print "ERROR-missing arguments domain level\n";
       
   428 	printhelp;
       
   429  }
       
   430 
       
   431  	print "\nINFO-downloading $target_url with label $tdomain\n";
       
   432   
       
   433 
       
   434  if (not $csvfile) {
       
   435 	if (not $ispackage) { 
       
   436 		$csvfile="output.csv";
       
   437 		
       
   438 	} else {
       
   439 		$csvfile="output.txt";
       
   440 		system ("rm *output.txt");
       
   441 	
       
   442 	}
       
   443  }
       
   444  print "\nINFO-output recorded in $csvfile \n";
       
   445 
       
   446                                       
       
   447 
       
   448 }
       
   449 #main
       
   450 $/ = " ";
       
   451 $host1 = "developer.symbian.org";
       
   452 
       
   453 cmd_options();
       
   454 
       
   455 if ($authon) {
       
   456 	#file containing login details from http cookie
       
   457 	$mycookie = loadfile("mycookie.txt");
       
   458 
       
   459 	$auth = "Cookie: " . $mycookie ;
       
   460 }
       
   461 
       
   462 
       
   463 if ($ispackage) {
       
   464 	getpage($target_url, $host1, $auth, "debug.txt");
       
   465 	@bklog = parse_category("debug.txt");
       
   466 	$j=0;
       
   467 	
       
   468 	foreach (@bklog) {
       
   469 		getpage("http://".$host1.$_, $host1, $auth, "pkg".$j.".txt");
       
   470 		parse_bklog ("pkg".$j.".txt",$csvfile, $j);
       
   471 		$j++;
       
   472 		
       
   473 	
       
   474 
       
   475 	}
       
   476 
       
   477 } else {
       
   478 
       
   479 	#foundation releases - add as required
       
   480 	@releases=("Symbian\\^2","Symbian\\^3","Symbian\\^4");
       
   481 
       
   482 	getpage($target_url, $host1, $auth, "debug.txt");
       
   483 	td_roadmap("debug.txt" , $csvfile, $tdomain ,@releases);
       
   484 }