Here's the code, if anyone can help:
Code:
#!/bin/gawk -f
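# awk script that tallies the hyperlink (href) and image (src) targets found
# in its input and prints how many times each one was seen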
# Hyperlinks look like '<a href="...">', so examine every line containing "href".
# Targets ending in html/htm/shtml/php are tallied in htm[]; every other
# non-empty target (e.g. a .exe download) is tallied in def[].
/href/ {
    # Walk every field of the record. An attribute such as href="page.html">text
    # is handled as one field, so it must not contain the field separator.
    for (i = 1; i <= NF; i++) {
        if ($i !~ /^href/)
            continue

        # Keep the link in an ordinary variable. Storing it in $f with f unset
        # would assign to $0, re-split the record and lose the remaining fields
        # (and hide the line from any later rule that matches on its content).
        link = href_target($i)
        if (link == "")          # empty target or no closing quote: nothing to count
            continue

        # "shtml" also ends in "html", so it is covered by the first alternative.
        if (link ~ /(html|htm|php)$/)
            htm[link]++
        else
            def[link]++
    }
}

# Return the text between the double quotes of one href="..." field.
#   field - a field that starts with href="
# Returns "" when the field has no closing double quote.
function href_target(field,    rest, quote_pos) {
    rest = substr(field, 7)               # skip the six characters of href="
    quote_pos = index(rest, "\"")         # position of the closing quote, 0 if absent
    # Start at 1: a start position of 0 is not portable and in some awk
    # implementations shortens the result by one character.
    return substr(rest, 1, quote_pos - 1)
}
# Image links look like '<img src="...">', so examine every line containing "src".
# Targets ending in jpg/jpeg/gif/png (any letter case) are tallied in img[].
/src/ {
    # Include the last field (i <= NF); the image reference is often the final
    # word on the line.
    for (i = 1; i <= NF; i++) {
        if ($i !~ /^src/)
            continue

        # Keep the link in an ordinary variable. Storing it in $f with f unset
        # would assign to $0 and re-split the record while it is being scanned.
        link = src_target($i)

        if (link ~ /([Jj][Pp][Ee]?[Gg]|[Gg][Ii][Ff]|[Pp][Nn][Gg])$/)
            img[link]++
    }
}

# Return the text between the double quotes of one src="..." field.
#   field - a field that starts with src="
# The link is cut at the closing quote rather than at "last character minus
# one", so src="a.jpg"> and src="a.jpg"/> are handled as well as src="a.jpg".
# Returns "" when the field has no closing double quote.
function src_target(field,    rest, quote_pos) {
    rest = substr(field, 6)               # skip the five characters of src="
    quote_pos = index(rest, "\"")         # position of the closing quote, 0 if absent
    return substr(rest, 1, quote_pos - 1)
}
# Final report: one section per tally array (htm, img, def), each line showing
# a link followed by the number of times it was counted.
END {
    print_section("\tHTML Documents: ", htm)
    print_section("\n\tIMAGES: ", img)
    print_section("\n\tOTHER: ", def)
}

# Print a heading line, then one tab-prefixed "key count" line per element
# of the array, in whatever order the awk implementation iterates it.
function print_section(heading, tally,    key) {
    print heading
    for (key in tally)
        print "\t" key, tally[key]
}