#!/afs/wsi/@sys/bin/perl ############################################################## # # # RCL : Radius-based Competitive Learning # # # # ------------------------------------------------------- # # # # Author: Udo Heuser (heuser@informatik.uni-tuebingen.de) # # # # ------------------------------------------------------- # # # # Refs.: # # 1. [HeHe95] R. Henrion, G. Henrion: Multivariate # # Datenanalyse, Methodik und Anwendung in der # # Chemie, Springer-Verlag, 1995 (in German) # # # # 2. [ScEr97] E. Schikuta, M. Erhart: The BANG-Clus- # # tering System: Grid-Based Data Analysis, in # # X. Liu, P. Cohen, M. Berthold (Eds.): Advances # # in Intelligent Data Analysis (IDA-97), LNCS 1280 # # pp. 513-524, 1997 # # 3. [Sedg92] R. Sedgewick, Algorithms in C++, # # Addison-Wesley Publishing Company, 1992 # # # ############################################################## # RCL default documents dir $rcl_home = "/home/heuser/OASIS/RCL/Docs"; # # usage # if ( ( $ARGV[0] =~ "-help" ) || ( $ARGV[0] =~ "-h" ) ) { &usage; exit; } elsif ( $ARGV[0] =~ "-v" ) { print STDERR "rcl version 0\n"; print STDERR "last modified: Thu Oct 1 11:17:29 MET DST 1998\n"; exit; } # # read in args # while ( $ARGV[0] =~ /^-/ ) { if ( $ARGV[1] =~ /^-/ ) { print STDERR "$0: Error in reading arguments.\n"; &usage; exit(1); } else { if ( $ARGV[0] eq "-i" ) { $i = "$rcl_home/" . $ARGV[1]; } elsif ( $ARGV[0] eq "-o" ) { $o = ">$rcl_home/" . $ARGV[1]; } elsif ( $ARGV[0] eq "-oeq" ) { $o_eqe = ">$rcl_home/" . $ARGV[1]; } elsif ( $ARGV[0] eq "-rad" ) { $radius = $ARGV[1]; } elsif ( $ARGV[0] eq "-non" ) { $max_neurons = $ARGV[1]; } elsif ( $ARGV[0] eq "-ovl" ) { $overlap = $ARGV[1]; } elsif ( $ARGV[0] eq "-sig" ) { $sigma = $ARGV[1]; } elsif ( $ARGV[0] eq "-log" ) { $log = $ARGV[1]; } elsif ( $ARGV[0] eq "-debug" ) { $debug = $ARGV[1]; } shift; shift; } } # # global args initialisation # # RCL input file if ( !defined($i) ) { $i = "$rcl_home/gm_out"; } # RCL output file if ( !defined($o) ) { $o = ">$rcl_home/rcl_out"; } # RCL expected quantisation error output file if ( !defined($o_eqe) ) { $o_eqe = ">$rcl_home/rcl_eqe"; } # overlap if ( !defined($overlap) ) { $overlap = 0; } # standard deviation of init neurons from mean 0 # center init neurons around mean # if ( !defined($sigma) ) { $sigma = 0.02; } # scatter neurons thru input space if ( !defined($sigma) ) { $sigma = 0.5; } # RCL error log file if ( !defined($log) ) { $log = "$rcl_home/rcl.log"; } # RCL temp dir if ( !defined($tmp_dir) ) { $tmp_dir = "$rcl_home/tmp"; } # debug mode? if ( !defined($debug) ) { $debug = 0; } # define logging file if ( ($log eq "STDERR") || ($log eq "stderr") ) { $log = ">&STDERR"; } else { $log = ">" . $log; } if ( !defined open(LOG, $log) ) { &terminate("$0: Can't open $log for error logging: $!\n"); } # # global constant defs # # Min/Max integer values (machine dependent) $MAXINT = 1E308; $MININT = -1E308; # outer space boundary $OUT = 1.5; # calculate exact expected quantisation error? $ceqe_ext = 1; # calculate expected quantisation error using $radius? $ceqe_rad = 0; # calculate expected quantisation error using F-criterion? $ceqe_fcr = 1; # calculate expected quantisation error using normalized F-criterion? $ceqe_nfcr = 1; # fine tune results? $fine_tuning = 1; $const_nseq_rep_0 = 150; $const_nseq_rep_1 = 0.8; $const_eqe_der = 30; $const_pruning = 20; # # global vars initialisations # # dimension of input document vectors (profiles) $dim = 0; # no. of input document vectors $samples = 0; # no. of training cycles $lsteps = 0; # $min_sample threshold: no. of equally distributed document vectors # of every neuron with radius $radius $min_sample = $MAXINT; # boundaries of BANG cells @bound; # define overlap ranges if ( $overlap < 0 || $overlap >= 1 ) { &terminate("$0: overlap out of range [0;1[\n"); } # # main # # read document vectors and dim from documents input file &rcl_in; # define RCL neuron traces, centroids and sample output file if ( $dim < 4 ) { # neuron traces output file if ( !defined($o_nt) ) { $o_nt = ">$rcl_home/rcl_trace"; } # cluster centroids output file if ( !defined($o_ctr) ) { $o_ctr = ">$rcl_home/rcl_ctr"; } # current input sample output file if ( !defined($o_sam) ) { $o_sam = ">$rcl_home/rcl_osam"; } } # Generating $max_neurons neurons with initial randomized $dim-dim weights. print LOG "Initialize RCL neurons.\n"; if ( defined($max_neurons) ) { &rcl_init; } elsif ( defined($radius) ) { for ( $d=0; $d<$dim; ++$d ) { $bound[$d][0] = -1; $bound[$d][1] = 1; } $n = 0; &rcl_init_auto(*bound); } else { &terminate("$0: Neither radius nor no. of init neurons defined!\n"); } # print out startup values print LOG "\tradius: $radius, overlap: $overlap\n"; print LOG "\tno. of init neurons: $max_neurons\n"; print LOG "\tno. of document vectors: $samples, dim: $dim\n"; print LOG "\tmax. no. of training cycles: $lsteps\n"; print LOG "\tmin. no. of samples for radius $radius: $min_sample\n"; # print out init neuron traces if ( defined($o_nt) ) { &pr_ntr_init; } # print out init expected quantisation error if ( defined($o_eqe) ) { &pr_eqe_init; } # RCL training cycles if ( $lsteps ) { &rcl_lstep; } # display total consumed time and close report files ∑ # return no. of detected clusters # return $max_neurons; exit; # # subroutines # # print usage sub usage { # local($pr_name) = split(/\//, $0); # print STDERR "\nUsage: $pr_name \{-i \} \{-o \} \{-ont \}\n"; print STDERR "\nUsage: $0 \{-i \} \{-o \}\n"; print STDERR " \{-oeq \}\n"; print STDERR " [-rad ]0;1] | -non [1;inf[ ] \{-ovl [0;1[\} \{-sig [0;1]\}\n"; print STDERR " \{-log \} \{-debug [0|1]\}\n\n"; print STDERR "\t### RCL : Radius-based Competitive Learning. ###\n"; print STDERR "\t### Author: Udo Heuser (heuser\@informatik.uni-tuebingen.de) ###\n\n"; print STDERR "Options:\n"; print STDERR "\t-i : document vectors (profiles) input file\n"; print STDERR "\t-o : RCL output file\n"; print STDERR "\t-oeq : expected quantisation error output file\n"; print STDERR "\t-rad : radius\n"; print STDERR "\t-non : no. of init neurons\n"; print STDERR "\t-ovl : overlap\n"; print STDERR "\t-sig : sigma: standard deviation of init neurons from mean 0\n"; print STDERR "\n\t-log : (error) logging file\n"; print STDERR "\t-debug : debug mode?\n"; print STDERR "\t-h : help\n"; print STDERR "\t-v : rcl version\n"; print STDERR "\n"; } # read document vectors from documents input file # e.g.: $i = "$rcl_home/gm_out" sub rcl_in { if ( !defined($i) || !defined open(RCL_IN, $i) ) { &terminate("$0: Can't open $i: $!\n"); } local(@in_row) = ""; local($d) = 0; local($s) = 0; local($min) = $MAXINT; local($max) = $MININT; local($/) = "\n"; local($*) = 0; # processing lines instead of paragraphs print LOG "Read document vectors (profiles) from file $i.\n"; while ( $_ = ) { chop($_); @in_row = split(/\s+/, $_); if ( $in_row[0] eq "#" ) { if ( $in_row[1] =~ /^dim/ ) { $dim = $in_row[2]; } } elsif ( $in_row[0] ne "" ) { if ( !$dim ) { &terminate("$0: dim undefined!\n"); } for ( $d=0; $d<$dim; ++$d ) { if ( defined($in_row[$d]) ) { # if ( $in_row[$d] == 0 && $in_row[$d] ne "0" ) { &terminate("$0: Error in reading document vectors from $i (line $samples): not a number!\n"); } $vec[$samples][$d] = $in_row[$d]; if ( $vec[$samples][$d] > $max ) { $max = $vec[$samples][$d] }; if ( $vec[$samples][$d] < $min ) { $min = $vec[$samples][$d] }; # print LOG "vec[$samples][$d]: $vec[$samples][$d]\n" if ($debug); } else { &terminate("$0: Error in reading document vectors from $i (line $samples)\n"); } } ++$samples; } } # if not already scaled, scale to values [-1;1] if ( $max != 1 || $min != -1 ) { if ( ($max - $min) != 1 ) { print LOG "\tScaling input document vectors to [-1;1]\n"; for ( $d=0; $d<$dim; ++$d ) { for ( $s=0; $s<$samples; ++$s ) { $vec[$s][$d] = 2 * ($vec[$s][$d] - $min) / ($max - $min) - 1; } } } else { &terminate("$0: Error in scale: max - min == 1\n"); } } # no. of training cycles: heuristic value! $lsteps = $samples * 8; } # Generating $max_neurons neurons with initial randomized $dim dim. weights. sub rcl_init { local($d) = 0; local($n) = 0; local($i) = 0; local($sum) = 0; local($radius_calc) = 0; # if ( defined($radius) ) { # # calculating max. no. of neurons at given overlap and radius # $max_neurons_calc = 0; # local($centroid) = -1 + $radius; # local($b) = 0; # while ( ($b = ($centroid + $radius)) <= 1.0 ) { # # print STDERR "centroid: $centroid, radius: $radius, max_neurons: $max_neurons_calc, c+r: $b\n" if ($debug); # ++$max_neurons_calc; # $centroid += ( (2*$radius) - ($overlap*2*$radius) ); # } # $max_neurons_calc **= $dim; # if ( ($max_neurons_calc <= 0) ) { # &terminate("$0: max. no. of neurons out of range ]0;inf[\n"); # } # # if ( defined($max_neurons) ) { # if ( ($max_neurons <= 0) || ($max_neurons > $max_neurons_calc) ) { # &terminate("$0: max. no. of neurons out of range ]0;inf[\n"); # } # } # else { # $max_neurons = $max_neurons_calc; # } # # } # # elsif ( defined($max_neurons) ) { # calculating radius at given max. no. of neurons and overlap $radius_calc = (1+$overlap) * ( 1/(($max_neurons)**(1/$dim)) ); if ( ($radius_calc <= 0) || ($radius_calc > 1) ) { &terminate("$0: radius out of range ]0;1]\n"); } if ( defined($radius) ) { if ( ($radius <= 0) || ($radius > 1) || ($radius > $radius_calc) ) { &terminate("$0: radius out of range ]0;1]\n"); } } else { $radius = $radius_calc; } # $min_sample threshold: no. of equally distributed document vectors # of each neuron with radius $radius $min_sample = ((2*$radius)**$dim) / (2**$dim) * $samples; # $min_sample = ((($radius**$dim)/2)*(2**$dim)) / (2**$dim) * $samples; print LOG "Generating $max_neurons RCL neurons.\n"; print LOG "\tdim: $dim, standard deviation from mean 0: $sigma\n"; srand; for ( $n=0; $n<$max_neurons; ++$n ) { for ( $d=0; $d<$dim; ++$d ) { # calculating inner sum $sum = 0; for ( $i=0; $i<12; ++$i ) { $sum += rand(1); } $sum -= 6; # neuron[neuron number][step][dim] = mean + sigma * sum; $neuron[$n][0][$d] = 0 + $sigma * $sum; print LOG "neuron[no $n][step 0][dim $d] = $neuron[$n][0][$d]\n" if ($debug); } print LOG "\n" if ($debug); } } # automatic init of neurons using BANG like clustering (cf.[ScEr97]) sub rcl_init_auto { local(*bound) = pop(@_); local(@lbd); local(@llbd); local($d) = 0; local($i) = 0; local($j) = 0; local($j_ind) = 0; local($j_iind) = 0; local($lsampn) = 0; # local($llsampn) = 0; local($s_ind) = 0; local($in) = 1; # local($iin) = 1; # calculate cell boundaries @bound for ( $d=0; $d<$dim; ++$d ) { $lbd[$d][0] = $bound[$d][0]; $lbd[$d][1] = ($bound[$d][1] + $bound[$d][0])/2; $lbd[$d][2] = $bound[$d][1]; } for ( $d=0; $d<$dim; ++$d ) { for ( $i=0; $i<2; ++$i ) { $llbd[$d][$i][0] = $lbd[$d][$i]; $llbd[$d][$i][1] = $lbd[$d][$i+1]; } } for ( $j=0; $j<2**$dim; ++$j ) { $j_iind = $j; for ( $d=$dim-1; $d>=0; --$d ) { $j_ind = int($j_iind / (2**$d)); $j_iind = int($j_iind % (2**$d)); for ( $i=0; $i<2; ++$i ) { $bound[$d][$i] = $llbd[$d][$j_ind][$i]; } } $abs = $bound[0][1] - $bound[0][0]; if ( $abs < 0 ) { $abs = -$abs; } # calculate number of samples of cell limited by @bound $lsampn = 0; # $llsampn = 0; for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { $in = 1; # $iin = 1; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] > $bound[$d][1]) || ($vec[$s_ind][$d] < $bound[$d][0]) ) { $in = 0; # $iin = 0; last; # break } # calculate number of samples of cell limited by $radius # if ( ($vec[$s_ind][$d] > ($bound[$d][0]+($bound[$d][1] - $bound[$d][0])/2 + $radius) ) || # ($vec[$s_ind][$d] < ($bound[$d][0]+($bound[$d][1] - $bound[$d][0])/2 - $radius)) ) { # $iin = 0; # } } if ( $in ) { ++$lsampn; } # if ( $iin ) { ++$llsampn; } } # recursive cell refinement if ( ($lsampn + $lsampn/(5*$dim)) > ($samples / ($dim**(2/$abs))) && ($lsampn > $samples/100) ) { # +$lsampn/(5*$dim) and ($lsampn > $samples/100): heuristic values!!! if ( ($abs / 2) <= $radius ) { # init neuron pos at cell center for ( $d=0; $d<$dim; ++$d ) { $neuron[$n][0][$d] = $bound[$d][0]+($bound[$d][1] - $bound[$d][0])/2; print LOG "neuron[no $n][step 0][dim $d] = $neuron[$n][0][$d]\n" if ($debug); } # define $min_sample threshold: minimum no. of samples for given radius if ( $lsampn < $min_sample ) { $min_sample = $lsampn; } ++$n; ++$max_neurons; } else { # refinement step &rcl_init_auto(*bound); } } } # for $j } # print out init neuron traces sub pr_ntr_init { # write updated gm_out file open(NTR_OUT, $o_nt) || &terminate("$0: Can't open $o_nt for writing: $!\n"); print LOG "Writing init neuron traces to file $o_nt.\n"; print(NTR_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(NTR_OUT "#\n\n"); for ( $n_ind=0; $n_ind<$max_neurons; ++$n_ind ) { print(NTR_OUT "# prune map:\t$n_ind\t1\n"); } print(NTR_OUT "\n"); for ( $n=0; $n<$max_neurons; ++$n ) { print(NTR_OUT "# RCL neuron $n\n"); for ( $d=0; $d<$dim; ++$d ) { print(NTR_OUT "$neuron[$n][0][$d] "); } print(NTR_OUT "\n\n"); } close(NTR_OUT); } # calculate and print out init expected quantisation errors sub pr_eqe_init { local($n); local($s_ind); local($in); local($d); local($eqe_n) = 0; local($eqe_ext) = 0; local($eqe_ext_nn) = 0; local($eqe_rad) = 0; local($eqe_fcr) = 0; local($eqe_nfcr) = 0; local($abs) = 0; local($min); local($in) = 0; local($l) = 0; local(@scount) = ""; # calculate EQE using exact Voronoi sets for all neurons if ( $ceqe_ext || $ceqe_fcr || $ceqe_nfcr ) { for ( $n=0; $n<$max_neurons; ++$n ) { $scount[$n] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { $min = $MAXINT; for ( $n=0; $n<$max_neurons; ++$n ) { $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $vec[$s_ind][$d]); $abs **= 2; $eqe_n += $abs; } if ( $eqe_n < $min ) { $min = $eqe_n; $in = $n; } } # for $n $eqe_ext += $min; # counting number of samples for each Voronoi set ++$scount[$in]; } # for $s_ind # normalize expected quantisation error $eqe_ext_nn = $eqe_ext; $eqe_ext /= $samples; } # calculate EQE using $radius to define approximated Voronoi sets if ( $ceqe_rad ) { for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] < ($neuron[$n][$l][$d] - $radius)) || ($vec[$s_ind][$d] > ($neuron[$n][$l][$d] + $radius)) ) { $in = 0; last; # break } else { $abs=($neuron[$n][$l][$d] - $vec[$s_ind][$d]); $abs **= 2; $eqe_n += $abs; } } if ( $in ) { # $eqe_rad += sqrt($eqe_n); $eqe_rad += $eqe_n; } } # for $n } # for $s_ind # normalize expected quantisation error $eqe_rad /= $samples; } # calculate EQE using F-criterion (cf. [HeHe96], p. 57) if ( $ceqe_fcr || $ceqe_nfcr ) { local(@mean) = ""; # calculate mean of total data set for ( $d=0; $d<$dim; ++$d ) { $mean[$d] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $d=0; $d<$dim; ++$d ) { $mean[$d] += $vec[$s_ind][$d]; } } # normalize $mean[$d] for ( $d=0; $d<$dim; ++$d ) { $mean[$d] /= $samples; } for ( $n=0; $n<$max_neurons; ++$n ) { $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $mean[$d]); $abs **= 2; $eqe_n += $abs; } $eqe_fcr += $eqe_n * $scount[$n]; } # for $n if ( $max_neurons > 1 ) { $eqe_fcr /= $eqe_ext_nn * ($samples - $max_neurons) / ($max_neurons - 1); } } # calculate EQE using normalized F-criterion if ( $ceqe_nfcr ) { $eqe_nfcr = $eqe_fcr / $max_neurons; } print LOG "$l: eqe_ext: $eqe_ext, eqe_rad: $eqe_rad, eqe_fcr: $eqe_fcr, eqe_nfcr: $eqe_nfcr\n" if ($debug); # init eqe output file open(EQE_OUT, $o_eqe) || &terminate("$0: Can't open $o_eqe for writing: $!\n"); print(EQE_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(EQE_OUT "#\n\n"); # order of eqe's are important! print(EQE_OUT "# RCL eqe 0\n"); if ( $ceqe_ext ) { print(EQE_OUT "0 $eqe_ext\n"); } print(EQE_OUT "\n"); print(EQE_OUT "# RCL eqe 1\n"); if ( $ceqe_rad ) { print(EQE_OUT "0 $eqe_rad\n"); } print(EQE_OUT "\n"); print(EQE_OUT "# RCL eqe 2\n"); if ( $ceqe_fcr ) { print(EQE_OUT "0 $eqe_fcr\n"); } print(EQE_OUT "\n"); print(EQE_OUT "# RCL eqe 3\n"); if ( $ceqe_nfcr ) { print(EQE_OUT "0 $eqe_nfcr\n"); } print(EQE_OUT "\n"); } # RCL training cycles sub rcl_lstep { local($l) = 0; local($l_bak) = 0; local($s); local(@nseq) = ""; local(@nrep) = ""; local(@sampn) = ""; local(@dist) = ""; local($in); local(@fixed) = ""; local($incs) = 0; local($ncon) = 0; local($na) = 0; local(@nadp); local($ns) = 0; local($n_oldp) = 0; local($n_oldp_1) = 0; local($natc) = 0; local($eqe) = 0; local($eqe_bak) = 0; local($nasum) = 0; local($opfound) = 0; local($adrt_0) = 0; local($adrt_1) = 0; local($adrt_2) = 0; local($term_learning) = 0; local($fine_tuning_init) = 0; local($ftradius) = $radius; local($s_ind) = 0; local($abs) = 0; local($out) = 0; print LOG "Start RCL learning.\n"; # init no. of samples per neuron at given radius, # is_fixed values and no. of not adapted neurons for ( $n=0; $n<$max_neurons; ++$n ) { $sampn[$n] = 0; $fixed[$n] = 0; $nadp[$n] = 0; &update_samples(*sampn, *fixed, $n, 0); } srand; # iterate thru no. of RCL learning steps do { if ( !($l%10) ) { if ( !($fine_tuning_init) ) { print STDERR "RCL learning step $l.\n"; } else { print STDERR "RCL fine tuning step $l.\n"; } } # init no. of not adapted neurons per training cycle $na = 0; # randomly choose input vector $s = int(rand($samples)); if ( defined($o_sam) ) { &record_sample($s); } # (quick-)sort neurons $neuron[$n][$l][$d] relative to distances # from chosen input vector $vec[$s][$d] # init current neuron sequence for ( $n=0; $n<$max_neurons; ++$n ) { for ( $d=0; $d<$dim; ++$d ) { $nseq[$n][$d] = $neuron[$n][$l][$d]; } } # quicksort current neuron sequence &quicksort(*nseq, $s, 0, $max_neurons-1); # record hashes for ( $n=0; $n<$max_neurons; ++$n ) { $ns = 0; $opfound = 0; do { for ( $d=0; $d<$dim; ++$d ) { if ( $neuron[$ns][$l][$d] != $nseq[$n][$d] ) { ++$ns; $opfound = 0; last; # break } else { $opfound = 1; } } } until ( $opfound ); # complete mapping: hash: $n($nseq) -> $n($neuron) $hash[$n] = $ns; } # to do: repell distant neurons from input? # $neu_dev = int($max_neurons * 2 / 3); # addapt neurons towards input vector for ( $n=0; $n<$max_neurons; ++$n ) { # old $n pos $n_oldp = $hash[$n]; for ( $d=0; $d<$dim; ++$d ) { $dist[$d] = ($vec[$s][$d] - $nseq[$n][$d]); # distance based adaptation # $nseq[$n][$d] += ( 0.8 * exp(-3.5 * $dist[$d]) ); # neuron sequence based adaptation # nseq := nseq + max. adapt rate * time dependecies * neuron sequence dep. * dist. # adrt decreasing from 100% to 0% through neuron sequence and ... if ( $max_neurons > 1 ) { $adrt_0 = exp(-5.5 * (($n - $na) / ($max_neurons-1))); } else { $adrt_0 = exp(-5.5 * ($n - $na)); } if ( $fine_tuning_init ) { # adrt decreasing from 50% to 0 % during fine tuning steps $adrt_1 = exp(-5 * ($l/$l_bak - 1)) / 2; } else { # adrt decreasing from 80% to 0 % during learning steps $adrt_1 = exp(-10 * $l / $lsteps) / 1.25; } $nseq[$n][$d] += $adrt_0 * $adrt_1 * $dist[$d]; # calculate pos of neuron to be repelled $nrep[$d] = $neuron[$n_oldp][$l][$d]; # adrt decreasing from 100% through neuron sequence and ... if ( $max_neurons > 1 ) { $adrt_0 = -exp(5*($n / ($max_neurons-1))) / $const_nseq_rep_0 + 1; } else { $adrt_0 = -exp(5 * $n) / $const_nseq_rep_0 + 1; } # adrt increasing from 30% to inf during learning steps $adrt_1 = exp(15 * $l / $lsteps) - $const_nseq_rep_1; $nrep[$d] -= $adrt_0 * $adrt_1 * $dist[$d]; } if ( $debug && !($l%10) ) { # if ( !($l%10) ) { $adrt_2 = $adrt_0 * $adrt_1; print LOG "\tadp rate for neuron $n:\n"; print LOG "\t$adrt_1 * $adrt_0 = $adrt_2\n"; } if ( $n == 0 ) { if ( &increase_samples(*sampn, *fixed, $n, $n_oldp) ) { # adapt neuron 0 of current neuron sequence print LOG "incs: 1\n" if ($debug); for ( $d=0; $d<$dim; ++$d ) { $neuron[$n_oldp][$l+1][$d] = $nseq[$n][$d]; $neuron[$n_oldp][$l][$d] = $neuron[$n_oldp][$l+1][$d]; } # unfix neuron 1 of current neuron sequence $n_oldp_1 = $hash[1]; if ( &neuron_conflict_mut_manh($n_oldp, $n_oldp_1) ) { $sampn[$n_oldp_1] = 0; $fixed[$n_oldp_1] = 0; } } else { # keep old neuron position print LOG "incs: 0\n" if ($debug); for ( $d=0; $d<$dim; ++$d ) { $neuron[$n_oldp][$l+1][$d] = $neuron[$n_oldp][$l][$d]; } ++$na; } } # if $n == 0 elsif ( &neuron_conflict_manh($n, $n_oldp) ) { if ( !($fixed[$n_oldp]) ) { # re-position neuron $n for ( $d=0; $d<$dim; ++$d ) { # repel neuron[neuron number][step][dim] from current input $neuron[$n_oldp][$l+1][$d] = $nrep[$d]; # update current neuron pos ( for &neuron_conflict($n) ) $neuron[$n_oldp][$l][$d] = $neuron[$n_oldp][$l+1][$d]; } # update sampn value and fix neuron if necessary if ( &repneu_conflict_manh($n_oldp, $l) ) { $sampn[$n_oldp] = 0; $fixed[$n_oldp] = 0; } else { &update_samples(*sampn, *fixed, $n_oldp, $l+1); } ++$na; } # if !($fixed[$n_oldp]) else { # keep old neuron position print LOG "incs: 0\n" if ($debug); for ( $d=0; $d<$dim; ++$d ) { $neuron[$n_oldp][$l+1][$d] = $neuron[$n_oldp][$l][$d]; } ++$na; } } # elsif neuron_conflict elsif ( &increase_samples(*sampn, *fixed, $n, $n_oldp) ) { print LOG "incs: 1\n" if ($debug); for ( $d=0; $d<$dim; ++$d ) { $neuron[$n_oldp][$l+1][$d] = $nseq[$n][$d]; $neuron[$n_oldp][$l][$d] = $neuron[$n_oldp][$l+1][$d]; } } # elsif increase_samples else { # keep old neuron position print LOG "incs: 0\n" if ($debug); for ( $d=0; $d<$dim; ++$d ) { $neuron[$n_oldp][$l+1][$d] = $neuron[$n_oldp][$l][$d]; } ++$na; } print LOG "n: $n_oldp, l: $l, fixed[$n_oldp]: $fixed[$n_oldp], sampn[$n_oldp]: $sampn[$n_oldp], max_neurons: $max_neurons, na: $na\n\n" if ($debug); } # for $n # "Vorwaerts immer, rueckwaerts nimmer" (Erich) ++$l; # pruning: discard neurons that haven't been adapted for long for ( $n=0; $n<$max_neurons; ++$n ) { if ( !($fixed[$n]) ) { ++$nadp[$n]; } else { $nadp[$n] = 0; } $out = 0; for ( $d=0; $d<$dim; ++$d ) { $abs = $neuron[$n][$l][$d]; if ( $abs < 0 ) { $abs = -$abs; } if ( $abs > $OUT ) { $out = 1; } } if ( ($nadp[$n] >= ($samples / $const_pruning)) || $out ) { &prune_neuron($n, $l); } if ( $max_neurons <= 1 ) { $term_learning = 1; } } # update neuron traces output file at learning step $l if ( defined($o_nt) ) { &pr_ntr_update($l); } system("gnuplot $rcl_home/ntraces") if ($debug); # system("gnuplot $rcl_home/gpl0") if ($debug); # system("gnuplot $rcl_home/gpl1") if ($debug); # system("gnuplot $rcl_home/gpl2") if ($debug); # calculate expected quantisation error for all neurons # and update EQE output file $eqe_bak = $eqe; $eqe = &eqe($l); # calculate EQE derivative as termination condition $eqe_der = ($eqe - $eqe_bak); if ( $eqe_der < 0 ) { $eqe_der = -$eqe_der; } if ( $eqe_der < 1E-3 ) { # 1E-4, 1E-5: heuristic values!!! ++$natc; } else { $natc = 0; } # RCL fine tuning cycles if ( ($natc >= ($samples / $const_eqe_der)) || ($l >= $lsteps) ) { if ( $fine_tuning ) { if ( !($fine_tuning_init) ) { # final pruning: discard neurons that aren't fixed or outside range &prune_neuron_final($l); # reset not adapted training cycles $natc = 0; # reset EQE derivative constant $const_eqe_der *= 1.5; # define fine tuning radius $ftradius *= 0.5; # define new $min_sample value $min_sample *= (0.5 ** $dim); if ( $min_sample < $samples/200 ) { $min_sample = $samples/200; } $min_sample = int($min_sample); print LOG "Start RCL fine tuning at learning step $l.\n"; print LOG "\texpected quantisation error: $eqe.\n"; print LOG "\tradius: $ftradius, overlap: $overlap\n"; print LOG "\tno. of neurons: $max_neurons\n"; # print LOG "\tno. of document vectors: $samples, dim: $dim\n"; # print LOG "\tmax. no. of training cycles: $lsteps\n"; print LOG "\tmin. no. of samples for radius $ftradius: $min_sample\n"; # record no. of current training cycle $l_bak = $l; for ( $n=0; $n<$max_neurons; ++$n ) { $sampn[$n] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] < ($nseq[$n][$d] - $ftradius)) || ($vec[$s_ind][$d] > ($nseq[$n][$d] + $ftradius)) ) { $in = 0; last; # break } } if ( $in ) { ++$sampn[$n]; } } } $fine_tuning_init = 1; } # if ( !$fine_tuning_init ) else { $term_learning = 1; } } # if ( $fine_tuning ) else { $term_learning = 1; } } # RCL fine tuning cycles # terminate RCL learning using expected quantisation error # (or on totally unadapted training cycles) } until ( $term_learning ); # final pruning: discard neurons that aren't fixed or outside range &prune_neuron_final($l); print LOG "Terminating RCL learning after $l_bak training cycles "; $l_bak = $l - $l_bak; print LOG "(plus $l_bak fine tuning steps).\n"; print LOG "\tfinal expected quantisation error: $eqe.\n"; # print out final neurons (cluster centroids) if ( defined($o_ctr) ) { &pr_centr($l); } # print RCL output if ( defined($o) ) { &pr_out($l); } } # record current sample to file sub record_sample { local($s) = pop(@_); open(SMP_OUT, $o_sam) || &terminate("$0: Can't open $o_sam for writing: $!\n"); print(SMP_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(SMP_OUT "#\n\n"); print(SMP_OUT "# RCL current sample vector $n\n"); for ( $d=0; $d<$dim; ++$d ) { print(SMP_OUT "$vec[$s][$d] "); } print(SMP_OUT "\n\n"); close(SMP_OUT); } # Quicksort: cf. [Sedg92], pp. 115 ff. # sorted neurons with smallest dist to vec at nseq-head sub quicksort { local(*nseq, $s, $le, $ri) = @_; local($i) = 0; local($j) = 0; local($d) = 0; local(@v) = ""; local($dist_v) = 0; local($dist_i) = 0; local($dist_j) = 0; local($abs) = 0; local($temp) = 0; if ( $ri > $le ) { $dist_v = 0; for ( $d=0; $d<$dim; ++$d ) { $v[$d] = $nseq[$ri][$d]; if ( ($v[$d] - $vec[$s][$d]) < 0 ) { $abs = -($v[$d] - $vec[$s][$d]); } else { $abs = ($v[$d] - $vec[$s][$d]); } # using Manhattan distance $dist_v += $abs; } $i = $le - 1; $j = $ri; for (;;) { do { ++$i; $dist_i = 0; for ( $d=0; $d<$dim; ++$d ) { if ( ($nseq[$i][$d] - $vec[$s][$d]) < 0 ) { $abs = -($nseq[$i][$d] - $vec[$s][$d]); } else { $abs = ($nseq[$i][$d] - $vec[$s][$d]); } # Manhattan distance $dist_i += $abs; } } until ( $dist_i >= $dist_v ); do { --$j; $dist_j = 0; for ( $d=0; $d<$dim; ++$d ) { if ( ($nseq[$j][$d] - $vec[$s][$d]) < 0 ) { $abs = -($nseq[$j][$d] - $vec[$s][$d]); } else { $abs = ($nseq[$j][$d] - $vec[$s][$d]); } # Manhattan distance $dist_j += $abs; } } until ( $dist_j <= $dist_v ); if ( $i >= $j ) { last; } # break # swap $nseq[$i] and $nseq[$j] for ( $d=0; $d<$dim; ++$d ) { $temp = $nseq[$i][$d]; $nseq[$i][$d] = $nseq[$j][$d]; $nseq[$j][$d] = $temp; } } # for (;;) # swap $nseq[$i] and $nseq[$ri] for ( $d=0; $d<$dim; ++$d ) { $temp = $nseq[$i][$d]; $nseq[$i][$d] = $nseq[$ri][$d]; $nseq[$ri][$d] = $temp; } &quicksort(*nseq, $s, $le, $i-1); &quicksort(*nseq, $s, $i+1, $ri); } } # update neuron $n's position if no. of included samples # at given radius excedes given threshold # (changes @sampn and @fixed arrays!) sub increase_samples { local(*sampn, *fixed, $nic, $n_oldp) = @_; local($lsampn) = 0; local($in); local($s_ind); local($d); local($update) = 0; for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { $in = 1; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] < ($nseq[$nic][$d] - $ftradius)) || ($vec[$s_ind][$d] > ($nseq[$nic][$d] + $ftradius)) ) { $in = 0; last; # break } } if ( $in ) { ++$lsampn; } } print LOG "samples: n: $n_oldp, $sampn[$n_oldp] -> $lsampn\n" if ($debug); # update no. of included samples for neuron $nic ($n_oldp) if ( (!($fixed[$n_oldp])) || ($lsampn >= $sampn[$n_oldp]) ) { $sampn[$n_oldp] = $lsampn; # fix neuron position if included samples excede given threshold $min_sample if ( $lsampn >= $min_sample ) { $fixed[$n_oldp] = 1; } else { $fixed[$n_oldp] = 0; } $update = 1; } else { $update = 0; } # # fix neuron position if included samples excede given threshold $min_sample # if ( $lsampn >= $min_sample ) { # $fixed[$n_oldp] = 1; # } return $update; } # update @sampn and @fixed values (cf. &increase_samples()) sub update_samples { local(*sampn, *fixed, $n, $l) = @_; local($lsampn) = 0; local($in); local($s_ind); local($d); local($update) = 0; for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { $in = 1; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] < ($neuron[$n][$l][$d] - $ftradius)) || ($vec[$s_ind][$d] > ($neuron[$n][$l][$d] + $ftradius)) ) { $in = 0; last; # break } } if ( $in ) { ++$lsampn; } } print LOG "samples: n: $n, $sampn[$n] -> $lsampn\n" if ($debug); # update no. of included samples for neuron $nic ($n_oldp) if ( (!($fixed[$n])) || ($lsampn >= $sampn[$n]) ) { $sampn[$n] = $lsampn; # fix neuron position if included samples excede given threshold $min_sample if ( $lsampn >= $min_sample ) { $fixed[$n] = 1; } else { $fixed[$n] = 0; } $update = 1; } else { $update = 0; } # # fix neuron position if included samples excede given threshold $min_sample # if ( $lsampn >= $min_sample ) { # $fixed[$n] = 1; # } return $update; } # is new position of neuron in concern $nic conflicting with # other neurons (depending on given $radius and $overlap) sub neuron_conflict { local($nic, $n_oldp) = @_; local($n) = 0; local($d) = 0; local($abs) = 0; local($in) = 1; for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; if ( $n != $n_oldp ) { for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $nseq[$nic][$d]); if ( $abs < 0 ) { $abs = -$abs; } if ( $abs > ((2*$radius) - ($overlap*$radius)) ) { $in = 0; last; # break } } if ( $in ) { return 1; } } # if $n } return 0; } # is new position of neuron in concern $nic conflicting with # other neurons (depending on given $radius and $overlap) # using Manhattan distance sub neuron_conflict_manh { local($nic, $n_oldp) = @_; local($n) = 0; local($d) = 0; local($abs) = 0; local($dist) = 0; for ( $n=0; $n<$max_neurons; ++$n ) { $dist = 0; if ( $n != $n_oldp ) { for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $nseq[$nic][$d]); if ( $abs < 0 ) { $abs = -$abs; } # Manhattan distance $dist += $abs; } if ( $dist < ((2*$radius) - ($overlap*$radius)) ) { return 1; } } } return 0; } # mutual neuron conflict sub neuron_conflict_mut { local($n0, $n1) = @_; local($d) = 0; local($abs) = 0; local($in) = 1; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n0][$l][$d] - $neuron[$n1][$l][$d]); if ( $abs < 0 ) { $abs = -$abs; } if ( $abs > ((2*$radius) - ($overlap*$radius)) ) { $in = 0; last; # break } } if ( $in ) { return 1; } else { return 0; } } # mutual neuron conflict using Manhattan distance sub neuron_conflict_mut_manh { local($n0, $n1) = @_; local($d) = 0; local($abs) = 0; local($dist) = 0; local($in) = 1; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n0][$l][$d] - $neuron[$n1][$l][$d]); if ( $abs < 0 ) { $abs = -$abs; } # Manhattan distance $dist += $abs; } if ( $dist < ((2*$radius) - ($overlap*$radius)) ) { return 1; } return 0; } # is position of repelled neuron conflicting with other neurons # (depending on given $radius and $overlap; cf. &neuron_conflict()) sub repneu_conflict { local($n_oldp, $l) = @_; local($n) = 0; local($d) = 0; local($abs) = 0; local($in) = 1; for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; if ( $n != $n_oldp ) { for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $neuron[$n_oldp][$l][$d]); if ( $abs < 0 ) { $abs = -$abs; } if ( $abs > ((2*$radius) - ($overlap*$radius)) ) { $in = 0; last; # break } } if ( $in ) { return 1; } } # if $n } return 0; } # is position of repelled neuron conflicting with other neurons # (depending on given $radius and $overlap; cf. &neuron_conflict()) # using Manhattan distance sub repneu_conflict_manh { local($n_oldp, $l) = @_; local($n) = 0; local($d) = 0; local($abs) = 0; local($dist) = 0; for ( $n=0; $n<$max_neurons; ++$n ) { $dist = 0; if ( $n != $n_oldp ) { for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $neuron[$n_oldp][$l][$d]); if ( $abs < 0 ) { $abs = -$abs; } # Manhattan distance $dist += $abs; } if ( $dist < ((2*$radius) - ($overlap*$radius)) ) { return 1; } } } return 0; } # pruning: discard neuron $n that hasn't been adapted for a long time # and record pruned neuron at neuron traces file sub prune_neuron { local($n, $l) = @_; print LOG "Pruning neuron $n at RCL learning step $l, "; if ( $n != ($max_neurons - 1) ) { for ( $d=0; $d<$dim; ++$d ) { $neuron[$n][$l][$d] = $neuron[$max_neurons-1][$l][$d]; } $sampn[$n] = $sampn[$max_neurons-1]; $fixed[$n] = $fixed[$max_neurons-1]; $nadp[$n] = $nadp[$max_neurons-1]; } --$max_neurons; print LOG "neurons left: $max_neurons\n"; if ( defined($o_nt) ) { # record pruned neuron at neuron traces file local(@in_row) = ""; local($l_old) = 0; local($l_ind) = 0; local($n_old) = 0; local($n_ind) = 0; local(@neuron_old) = ""; local($nn) = 0; local(@prn_map); local($iind) = 0; # open neuron traces file with read/write access local(@o_nt_rw_words) = split(/>/, $o_nt); local($o_nt_rw) = join("", @o_nt_rw_words); $o_nt_rw = "+<$o_nt_rw"; local($/) = "\n"; local($*) = 0; # processing lines instead of paragraphs open(NTR_OUT, $o_nt_rw) || &terminate("$0: Can't open $o_nt_rw for reading/writing: $!\n"); # read in old neuron traces while ( $_ = ) { chop($_); @in_row = split(/\s+/, $_); if ( $in_row[0] eq "" ) {} elsif ( $in_row[0] eq "#" ) { if ( ($in_row[1] eq "prune") && ($in_row[2] eq "map:") ) { $prn_map[$nn][0] = $in_row[3]; $prn_map[$nn][1] = $in_row[4]; ++$nn; } if ( ($in_row[1] eq "RCL") && ($in_row[2] eq "neuron") ) { $n_old = $in_row[3]; $l_old = 0; } } else { for ( $d=0; $d<$dim; ++$d ) { $neuron_old[$n_old][$l_old][$d] = $in_row[$d]; } ++$l_old; } } print LOG "Update neuron traces file $o_nt_rw.\n" if ($debug); seek(NTR_OUT, 0, 0); # setting filepointer at start of file print(NTR_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(NTR_OUT "#\n\n"); for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { if ( $n_ind == $n ) { if ( $n_ind != $max_neurons ) { $prn_map[$n_ind][0] = $prn_map[$max_neurons][0]; } else { $prn_map[$n_ind][0] = -1; } } elsif ( $n_ind == $max_neurons ) { $prn_map[$n_ind][0] = -1; } } for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { $prn_map[$n_ind][1] = 0; } for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { $iind = $prn_map[$n_ind][0]; if ( $iind != -1 ) { $prn_map[$iind][1] = 1; } } for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { print(NTR_OUT "# prune map:\t$prn_map[$n_ind][0]\t$prn_map[$n_ind][1]\n"); print LOG "# prune map : prn_map[$n_ind] :\t$prn_map[$n_ind][0]\t$prn_map[$n_ind][1]\n" if ($debug); } print(NTR_OUT "\n"); for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { print(NTR_OUT "# RCL neuron $n_ind\n"); for ( $l_ind=0; $l_ind<$l_old; ++$l_ind ) { for ( $d=0; $d<$dim; ++$d ) { print(NTR_OUT "$neuron_old[$n_ind][$l_ind][$d] "); } print(NTR_OUT "\n"); } print(NTR_OUT "\n\n"); } close(NTR_OUT); } # if ( defined($o_nt) ) } # final pruning: discard neurons that aren't fixed or outside range sub prune_neuron_final { local($l) = pop(@_); local($n); local($d); local($abs); local($out); for ( $n=$max_neurons-1; $n>=0; --$n ) { $out = 0; for ( $d=0; $d<$dim; ++$d ) { $abs = $neuron[$n][$l][$d]; if ( $abs < 0 ) { $abs = -$abs; } if ( $abs > $OUT ) { $out = 1; } } if ( (!($fixed[$n])) || $out ) { print LOG "Final pruning of neuron $n, "; if ( $n != ($max_neurons - 1) ) { for ( $d=0; $d<$dim; ++$d ) { $neuron[$n][$l][$d] = $neuron[$max_neurons-1][$l][$d]; } $sampn[$n] = $sampn[$max_neurons-1]; $fixed[$n] = $fixed[$max_neurons-1]; $nadp[$n] = $nadp[$max_neurons-1]; } --$max_neurons; print LOG "neurons left: $max_neurons\n"; } } # for $n } # calculate expected quantisation error sub eqe { local($l) = pop(@_); local($n); local($s_ind); local($in); local($d); local($eqe_n) = 0; local($eqe_ext) = 0; local($eqe_ext_nn) = 0; local($eqe_rad) = 0; local($eqe_fcr) = 0; local($eqe_nfcr) = 0; local($abs) = 0; local($min); local($in) = 0; local(@scount) = ""; # calculate EQE using exact Voronoi sets for all neurons if ( $ceqe_ext || $ceqe_fcr || $ceqe_nfcr ) { for ( $n=0; $n<$max_neurons; ++$n ) { $scount[$n] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { $min = $MAXINT; for ( $n=0; $n<$max_neurons; ++$n ) { $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $vec[$s_ind][$d]); $abs **= 2; $eqe_n += $abs; } if ( $eqe_n < $min ) { $min = $eqe_n; $in = $n; } } # for $n $eqe_ext += $min; # counting number of samples for each Voronoi set ++$scount[$in]; } # for $s_ind # not normalized eqe (for $ceqe_fcr) $eqe_ext_nn = $eqe_ext; # normalize expected quantisation error $eqe_ext /= $samples; } # calculate EQE using $radius to define approximated Voronoi sets if ( $ceqe_rad ) { for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { if ( ($vec[$s_ind][$d] < ($neuron[$n][$l][$d] - $radius)) || ($vec[$s_ind][$d] > ($neuron[$n][$l][$d] + $radius)) ) { $in = 0; last; # break } else { $abs=($neuron[$n][$l][$d] - $vec[$s_ind][$d]); $abs **= 2; $eqe_n += $abs; } } if ( $in ) { # $eqe_rad += sqrt($eqe_n); $eqe_rad += $eqe_n; } } # for $n } # for $s_ind # normalize expected quantisation error $eqe_rad /= $samples; } # calculate EQE using F-criterion (cf. [HeHe96], p. 57) if ( $ceqe_fcr || $ceqe_nfcr ) { local(@mean) = ""; for ( $d=0; $d<$dim; ++$d ) { $mean[$d] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $d=0; $d<$dim; ++$d ) { $mean[$d] += $vec[$s_ind][$d]; } } # normalize $mean[$d] for ( $d=0; $d<$dim; ++$d ) { $mean[$d] /= $samples; } for ( $n=0; $n<$max_neurons; ++$n ) { $eqe_n = 0; for ( $d=0; $d<$dim; ++$d ) { $abs=($neuron[$n][$l][$d] - $mean[$d]); $abs **= 2; $eqe_n += $abs; } $eqe_fcr += $eqe_n * $scount[$n]; } # for $n if ( $max_neurons > 1 ) { $eqe_fcr /= $eqe_ext_nn * ($samples - $max_neurons) / ($max_neurons - 1); } } # calculate EQE using normalized F-criterion if ( $ceqe_nfcr ) { $eqe_nfcr = $eqe_fcr / $max_neurons; } print LOG "$l: eqe_ext: $eqe_ext, eqe_rad: $eqe_rad, eqe_fcr: $eqe_fcr, eqe_nfcr: $eqe_nfcr\n" if ($debug); # update expected quantisation error # open expected quantisation error file with read/write access local(@o_eqe_rw_words) = split(/>/, $o_eqe); local($o_eqe_rw) = join("", @o_eqe_rw_words); local(@eqe) = ""; local($i) = 0; local($i_ind); local($l_ind); local(@in_row); local(@def) = ""; local($s_max) = 0; $o_eqe_rw = "+<$o_eqe_rw"; local($/) = "\n"; local($*) = 0; # processing lines instead of paragraphs open(EQE_OUT, $o_eqe_rw) || &terminate("$0: Can't open $o_eqe_rw for reading/writing: $!\n"); # read in old expected quantisation errors $eqe_n = -1; while ( $_ = ) { chop($_); @in_row = split(/\s+/, $_); if ( $in_row[0] eq "" ) { if ( ($eqe_n != -1) && !($s_ind) ) { $def[$eqe_n] = 0; } elsif ( ($eqe_n != -1) && ($s_ind) ) { $def[$eqe_n] = 1; } } elsif ( $in_row[0] eq "#" ) { if ( ($in_row[1] eq "RCL") && ($in_row[2] eq "eqe") ) { $eqe_n = $in_row[3]; $s_ind = 0; } } else { $eqe[$eqe_n][$s_ind] = $in_row[1]; ++$s_ind; if ( $s_ind > $s_max ) { $s_max = $s_ind; } } } print LOG "Update expected quantisation error file $o_eqe_rw.\n" if ($debug); seek(EQE_OUT, 0, 0); # setting filepointer at start of file print(EQE_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(EQE_OUT "#\n\n"); for ( $i_ind=0; $i_ind<=$eqe_n; ++$i_ind ) { print(EQE_OUT "# RCL eqe $i_ind\n"); if ( $def[$i_ind] ) { for ( $l_ind=0; $l_ind<$s_max; ++$l_ind ) { print(EQE_OUT "$l_ind $eqe[$i_ind][$l_ind]\n"); } # order of eqe's are important! if ( ($i_ind == 0) && ($ceqe_ext) ) { print(EQE_OUT "$l $eqe_ext\n\n"); } elsif ( ($i_ind == 1) && ($ceqe_rad) ) { print(EQE_OUT "$l $eqe_rad\n\n"); } elsif ( ($i_ind == 2) && ($ceqe_fcr) ) { print(EQE_OUT "$l $eqe_fcr\n\n"); } elsif ( ($i_ind == 3) && ($ceqe_nfcr) ) { print(EQE_OUT "$l $eqe_nfcr\n\n"); } } else { print(EQE_OUT "\n"); } } close(EQE_OUT); # return EQE value serving as termination condition return $eqe_ext; } # update neuron traces at learning step $l sub pr_ntr_update { local($l) = pop(@_); local(@in_row) = ""; local(@l_old); local($l_old_ind) = 0; local($l_ind) = 0; local($n_old) = 0; local($n_ind) = 0; local(@neuron_old) = ""; local($nn) = 0; local(@prn_map); local($iind) = 0; # open neuron traces file with read/write access local(@o_nt_rw_words) = split(/>/, $o_nt); local($o_nt_rw) = join("", @o_nt_rw_words); $o_nt_rw = "+<$o_nt_rw"; local($/) = "\n"; local($*) = 0; # processing lines instead of paragraphs open(NTR_OUT, $o_nt_rw) || &terminate("$0: Can't open $o_nt_rw for reading/writing: $!\n"); # read in old neuron traces while ( $_ = ) { chop($_); @in_row = split(/\s+/, $_); if ( $in_row[0] eq "" ) {} elsif ( $in_row[0] eq "#" ) { if ( ($in_row[1] eq "prune") && ($in_row[2] eq "map:") ) { $prn_map[$nn][0] = $in_row[3]; $prn_map[$nn][1] = $in_row[4]; ++$nn; } if ( ($in_row[1] eq "RCL") && ($in_row[2] eq "neuron") ) { $n_old = $in_row[3]; $l_old[$n_old] = 0; $l_old_ind = 0; } } else { for ( $d=0; $d<$dim; ++$d ) { $neuron_old[$n_old][$l_old_ind][$d] = $in_row[$d]; } ++$l_old_ind; } $l_old[$n_old] = $l_old_ind; } print LOG "Update neuron traces file $o_nt_rw.\n" if ($debug); seek(NTR_OUT, 0, 0); # setting filepointer at start of file print(NTR_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(NTR_OUT "#\n\n"); for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { print(NTR_OUT "# prune map:\t$prn_map[$n_ind][0]\t$prn_map[$n_ind][1]\n"); } print(NTR_OUT "\n"); $iind = 0; for ( $n_ind=0; $n_ind<$nn; ++$n_ind ) { print(NTR_OUT "# RCL neuron $n_ind\n"); for ( $l_ind=0; $l_ind<$l_old[$n_ind]; ++$l_ind ) { for ( $d=0; $d<$dim; ++$d ) { print(NTR_OUT "$neuron_old[$n_ind][$l_ind][$d] "); } print(NTR_OUT "\n"); } if ( $prn_map[$n_ind][1] ) { for ( $d=0; $d<$dim; ++$d ) { print(NTR_OUT "$neuron[$iind][$l][$d] "); } print(NTR_OUT "\n"); ++$iind; } print(NTR_OUT "\n"); } close(NTR_OUT); } # print out final neuron positions as cluster centroids sub pr_centr { local($l) = pop(@_); open(CTR_OUT, $o_ctr) || &terminate("$0: Can't open $o_ctr for writing: $!\n"); print LOG "Writing final RCL neurons (cluster centroids) to file $o_ctr.\n"; print(CTR_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(CTR_OUT "#\n\n"); for ( $n=0; $n<$max_neurons; ++$n ) { print(CTR_OUT "# RCL cluster centroid $n\n"); for ( $d=0; $d<$dim; ++$d ) { print(CTR_OUT "$neuron[$n][$l][$d] "); } print(CTR_OUT "\n\n"); } close(CTR_OUT); } # print RCL output sub pr_out { local($l) = pop(@_); local($s_ind) = 0; local($in) = 1; local($d) = 0; local($n) = 0; local($i) = 0; local(@i_bak); local(@lsampn); local($tsampn) = 0; local(@cl_vec); local($abs) = 0; local(@dist) = 0; local(@dist_bak) = 0; local(@mc_doc); # back up old rcl_out file local($o_bak) = $o . ".bak"; local(@o_org_words) = split(/>/, $o); local($o_org) = join("", @o_org_words); local(@o_bak_words) = split(/>/, $o_bak); $o_bak = join("", @o_bak_words); system("cp -f $o_org $o_bak"); for ( $n=0; $n<$max_neurons; ++$n ) { $lsampn[$n] = 0; $i_bak[$n] = 0; $dist[$n] = 0; $dist_bak[$n] = $MAXINT; $mc_doc[$n] = 0; } for ( $s_ind=0; $s_ind<$samples; ++$s_ind ) { for ( $n=0; $n<$max_neurons; ++$n ) { $in = 1; $dist[$n] = 0; for ( $d=0; $d<$dim; ++$d ) { # determine most centered doc if ( ($vec[$s_ind][$d] < ($neuron[$n][$l][$d] - $radius)) || ($vec[$s_ind][$d] > ($neuron[$n][$l][$d] + $radius)) ) { $in = 0; last; # break } else { $abs=($neuron[$n][$l][$d] - $vec[$s_ind][$d]); if ( $abs < 0 ) { $abs = -$abs; } # Manhattan distance $dist[$n] += $abs; } } # if ( $dist[$n] < ((2*$radius) - ($overlap*$radius)) ) { if ( $in ) { if ( $dist[$n] < $dist_bak[$n] ) { # define most centered doc $mc_doc[$n] = $s_ind; $dist_bak[$n] = $dist[$n]; } # calculate no. of (total) included samples ++$lsampn[$n]; ++$tsampn; # record included vec no.s $i = $i_bak[$n]; $cl_vec[$n][$i] = $s_ind; ++$i_bak[$n]; } # if $in } # for $n } # for $s_ind # no. of unclustered docs $tsampn = $samples - $tsampn; open(RCL_OUT, $o) || &terminate("$0: Can't open $o for writing: $!\n"); print LOG "Writing RCL output to file $o.\n"; print(LOG "\tno. of input docs: $samples, dim: $dim\n"); print(LOG "\tno. of detected clusters: $max_neurons\n"); print(LOG "\tno. of unclustered docs: $tsampn\n"); print(RCL_OUT "# DO NOT EDIT BELOW THIS LINE!\n"); print(RCL_OUT "#\n"); print(RCL_OUT "# file format:\n"); print(RCL_OUT "# \tcluster centroid vector of ith cluster\n"); print(RCL_OUT "# \tno. of clustered docs of ith cluster\n"); print(RCL_OUT "# \tmost centered document (doc number)\n"); print(RCL_OUT "# \tclustered docs (doc numbers)\n"); print(RCL_OUT "#\n"); print(RCL_OUT "# no. of input docs: $samples, dim: $dim\n"); print(RCL_OUT "# no. of detected clusters: $max_neurons\n"); print(RCL_OUT "# no. of unclustered docs: $tsampn "); $tsampn = $tsampn / $samples * 100; print(RCL_OUT "($tsampn %)\n"); print(RCL_OUT "#\n\n"); for ( $n=0; $n<$max_neurons; ++$n ) { print(RCL_OUT "# RCL cluster $n\n"); for ( $d=0; $d<$dim; ++$d ) { print(RCL_OUT "$neuron[$n][$l][$d] "); } print(RCL_OUT "\n"); print(RCL_OUT "$lsampn[$n]\n"); print(RCL_OUT "$mc_doc[$n]\n"); for ( $i=0; $i<$i_bak[$n]; ++$i ) { print(RCL_OUT "$cl_vec[$n][$i]\n"); } print(RCL_OUT "\n"); } close(RCL_OUT); } # terminate process with error message sub terminate { local($err_msg) = pop(@_); # print error message print LOG $err_msg, "Aborting.\n"; # garbage collection close(RCL_IN); close(RCL_OUT); close(CTR_OUT); close(SMP_OUT); close(NTR_OUT); close(EQE_OUT); close(LOG); exit(1); } # exit execution with resume message sub sum { local($user); local($system); local($cuser); local($csystem); ($user,$system,$cuser,$csystem) = times; local($total) = $user + $system + $cuser + $csystem; local($sec); local($min); local($h); print LOG "\n"; print LOG "--------------------------- time consumed (in secs) ---------------------------\n"; print LOG "process\t\t\t\tchild processes\n"; print LOG "user time\tsystem time\tuser time\tsystem time\n"; print LOG "$user\t\t$system\t\t$cuser\t\t$csystem\n"; print LOG "total consumed time: $total sec "; $sec = $total % 60; $total /= 60; $min = $total % 60; $total /= 24; $h = $total % 24; print LOG "= $h h $min min $sec sec.\n"; print LOG "\n"; print LOG "$0 finished\!\n"; close LOG; return; }