Hello,
Here is a new version of my clustering program using the Perl Data Language.
Cheers,
Lino
#!/usr/bin/perl
use warnings;
use strict;
use PDL;
# fcm: fuzzy c-means implementation in Perl
# usage: $fcm [number_of_clusters] [fuzzification_factor]
# [max_iter] [tolerace]
# returns: prototypes, partition_matrix
#
#
# reading data
#
my ( @data, @tmp, $number_of_patterns, $max_row_number, $max_column_number );
while (defined(my $line = )) {
chomp ($line);
@tmp = split /\s+/, $line;
push @data, [ @tmp ];
}
$number_of_patterns = @data;
my $patterns = pdl(@data);
#
# assigning other variables
#
my $number_of_clusters = shift @ARGV;
my $fuzzification_factor = shift @ARGV;
my $max_iter = shift @ARGV;
my $tolerance = shift @ARGV;
unless (defined($number_of_clusters)) {
$number_of_clusters ||= 2;
}
unless (defined($fuzzification_factor)) {
$fuzzification_factor ||= 2.0;
}
unless (defined($max_iter)) { $max_iter ||= 40; }
unless (defined($tolerance)) { $tolerance ||= 0.00001; }
$number_of_clusters = abs($number_of_clusters);
$fuzzification_factor = abs($fuzzification_factor);
$max_iter = abs($max_iter);
$tolerance = abs($tolerance);
#
# initializing partition matrices
#
my $previous_partition_matrix;
my $current_partition_matrix =
initialize_partition_matrix($number_of_clusters, $number_of_patterns);
#
# output variables
#
my $prototypes;
my $performance_index;
#
# fuzzy c means implementation
#
$max_row_number = $number_of_patterns – 1;
$max_column_number = $number_of_clusters – 1;
my $iter = 0;
while (1) {
# computing each prototype
my $temporal_partition_matrix = $current_partition_matrix ** $fuzzification_factor;
my $temp_prototypes = mv( $temporal_partition_matrix x $patterns,1,0) / sumover($temporal_partition_matrix);
$prototypes = mv($temp_prototypes,1,0);
# copying partition matrix
$previous_partition_matrix = $current_partition_matrix->copy;
# updating the partition matrix
my $dist = zeroes $number_of_patterns, $number_of_clusters;
for my $i (0..$max_row_number){
for my $j (0..$max_column_number){
my $temp_distance = distance($patterns->slice(“:,$i”), $prototypes->slice(“:,$j”), \&euclidean );
$dist->set($i, $j, $temp_distance);
}
}
my $temp_variable = $dist ** (-2/($fuzzification_factor – 1));
$current_partition_matrix = $temp_variable / sumover(mv($temp_variable,1,0));
#
# Performance Index calculation
#
$temporal_partition_matrix = $current_partition_matrix ** $fuzzification_factor;
$performance_index = sum($temporal_partition_matrix * ( $dist ** 2 ));
# checking stop conditions
my $diff_partition_matrix = $current_partition_matrix – $previous_partition_matrix;
$iter++;
if ( ($diff_partition_matrix->max $max_iter) ) {
last;
}
print “iter = $iter\n”;
}
print “=======================================\n”;
print “clustering completed\n”;
print “performance index = $performance_index\n”;
print “prototypes = \n”;
print $prototypes;
print “current partition matrix = \n”;
print $current_partition_matrix;
# ================================
# initialize_partition_matrix
# partition_matrix =
# initialize_partition_matrix(
# num_clusters, num_patterns)
# ================================
sub initialize_partition_matrix {
my ($partition_matrix, $column_sum);
$partition_matrix = random($_[1],$_[0]);
$column_sum = sumover (mv($partition_matrix, 1, 0));#sum over columns
$partition_matrix /= $column_sum;
return $partition_matrix;
}
# ====================================
# compute distance between two vectors
# dist = distance( vector1, vector2, /&type_of_distance )
# ====================================
sub distance{
my ($vector1, $vector2, $type_of_distance) = @_;
my ($r) = $vector1 – $vector2;
$type_of_distance->($r);
}
sub manhattan{ sum(abs($_[0]));}
sub euclidean{ sum(sqrt($_[0] ** 2) );}
sub tschebyschev{ max(abs($_[0])); }
__DATA__
4.0 4.0
4.0 5.0
5.0 4.0
5.5 6.0
5.0 5.0
4.5 4.5
5.0 5.5
5.5 5.0
5.0 4.5
4.5 5.0
9.5 9.0
9.0 9.5
8.0 8.0
7.0 8.0
8.0 7.0
8.5 7.0
7.0 8.5
7.0 7.0
7.5 7.0
6.5 8.0
8.0 6.5
6.5 7.0
10.0 10.0
10.0 9.0
10.0 9.0
9.5 10.0
8.0 10.0
9.5 9.5
9.0 9.0
9.0 10.0
Recent Comments