textstat.pl

Tagged:  •    •    •    •    •    •  

Compute text statistics for text or HTML files and print the information to STDOUT. You can specify the locale setting (using the standard POSIX module) to make sure that special characters of your language (e.g. ä, ö, ü) are matched by \w.

#!/usr/bin/perl
# textstat.pl computes some statistics for a given plain text or
# HTML file supplied as an argument on the command line: the total
# number of wordforms, the number of different wordforms, the number
# of sentences and the total and relative frequencies of the different
# wordforms. A file is considered an HTML file if its extension matches
# the regular expression: '\.html?$'. The second optional argument
# determines the number of items in the frequency report, the default
# value is 20. The third optional argument is a language code.
# If supplied the locale settings are adjusted accordingly. This is
# important if the text contains special characters that are not
# matched by '\w'. 'en_US' is the default locale setting.
#
# Copyright 2004, Ramiro Gómez.
#
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
use strict;
use locale;
use POSIX 'locale_h';

# Script entry point: parse the command line, load the text of the given
# file (extracting the text from HTML files) and print its statistics.
die "Usage: $0 file [int] [de_DE]\ne.g.: $0 index.html 1000 de_DE\n"
    unless @ARGV;

# Arguments: file name, optional list length, optional locale name.
my $file = shift;

# A missing, zero or non-numeric length falls back to 20 entries; the sign
# and any fractional part of the supplied number are discarded.
my $length = int(abs(shift || 0)) || 20;
my $lang   = shift || 'en_US';
my $text   = '';

# Set locale so that \w and lc follow the character set of the language.
setlocale(LC_CTYPE, $lang) or die "Invalid locale $lang: $!";

# -T also fails for files that do not exist or cannot be read.
die "$file is not a text file.\n" unless -T $file;

if ($file =~ /\.html?$/i) {
    # Loaded at run time so that plain text files can be processed even
    # when HTML::TokeParser is not installed.
    require HTML::TokeParser;
    my $parser = HTML::TokeParser->new($file)
        or die "Can't open $file: $!";

    while (1) {
        # Read the text first so that text in front of the first tag is
        # kept. The blank keeps words on both sides of a tag apart.
        $text .= $parser->get_text() . ' ';

        my $token = $parser->get_token or last;

        # Script and style contents are code, not prose: read up to the
        # matching end tag and throw the result away.
        if ($token->[0] eq 'S' and $token->[1] =~ /^(?:script|style)$/) {
            $parser->get_text("/$token->[1]");
        }
    }
}
else {
    # Three-argument open: the file name can never be taken as a mode.
    open(my $in, '<', $file) or die "Can't open $file: $!";
    my $content = do { local $/; <$in> };    # slurp the whole file
    close($in) or die "Can't close $file: $!";
    $text = $content if defined $content;
}

statistics($file, $text, $length);

# Compute statistics for a text and print the results to STDOUT.
#
# Arguments (positional):
#   $file   - file name, used only in the printed messages
#   $text   - the text to analyse
#   $length - maximum number of entries in the frequency list
#
# Prints the number of wordforms, the number of different wordforms, the
# average wordform length and the number of sentences, followed by a
# frequency list written with the STDOUT / STDOUT_TOP formats.
# Dies with "No text found in $file" if the text contains no wordform.
sub statistics {
    my ($file, $text, $length) = @_;
    $text = '' unless defined $text;

    my %words;                   # wordform => number of occurrences
    my $number_of_words  = 0;    # total number of wordforms
    my $total_wordlength = 0;    # total length of all words

    $text =~ s/\s+/ /g;    # replace one or more whitespace chars with ' '

    # Array of sentences (rudimentary): split at full stops, but keep only
    # fragments that contain a word character. Otherwise the blank after
    # the last full stop and the gaps inside "..." count as sentences.
    my @sentences = grep { /\w/ } split /\./, $text;

    $text = lc $text;         # lowercase text
    $text =~ s/[^\w]/ /g;     # replace anything but word characters
    $text =~ s/[\d_]/ /g;     # replace digits and underscores

    # count words
    while ($text =~ /(\w+)/g) {
        $words{$1}++;
        $number_of_words++;
        $total_wordlength += length($1);
    }

    # the word count is the divisor for the average word length and the
    # relative frequencies, so it must not be zero
    die "No text found in $file\n" unless $number_of_words;

    # print statistics
    print "Text statistics for file: $file\n";
    print "Total number of wordforms: $number_of_words\n";
    print "Number of different wordforms: " . scalar(keys %words) . "\n";
    print "Average length of a wordform in characters: "
        . sprintf("%.2f", $total_wordlength / $number_of_words) . "\n";
    print "Number of sentences: " . scalar(@sentences) . "\n\n";

    # Most frequent first; equally frequent wordforms are ordered by name
    # so that the report is the same on every run.
    my @sorted = sort { $words{$b} <=> $words{$a} or $a cmp $b } keys %words;

    # index of the last list entry: never more entries than requested and
    # never more than there are different wordforms
    $length = 0 unless defined $length and $length > 0;
    my $last = ($length < @sorted ? $length : scalar(@sorted)) - 1;

    # report variables, filled in for every line of the frequency list
    my ($rank, $wordform, $occurrence, $rel_frequency);

    # define report format (format lines have to start in the first column;
    # the header columns are aligned with the fields of the data line)
format STDOUT_TOP =
Frequency List                                        Page: @>>>>>
$%
Rank Wordform                             Number of       Relative
                                        occurrences frequency in %
------------------------------------------------------------------
.
format STDOUT =
@>>> @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>>>>>>>> @>>>>>>>>>>>>>
$rank, $wordform, $occurrence, $rel_frequency
.

    # print frequency list
    for my $index (0 .. $last) {
        $wordform   = $sorted[$index];
        $occurrence = $words{$wordform};
        # relative frequency of word
        $rel_frequency = sprintf("%.2f", 100 * $occurrence / $number_of_words);
        $rank = $index + 1;
        write;
    }

    return;
}

Post new comment

The content of this field is kept private and will not be shown publicly.
  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd> <p> <br>

More information about formatting options