contrib/network_model/regress.py

   1 #!/usr/bin/env python
   2
   3 import sys
   4 from math import sqrt
   5
   6
   7 if len(sys.argv) != 2 and len(sys.argv) != 4:
   8         print("Usage : %s datafile", sys.argv[0])
   9         print("or    : %s datafile p1 p2", sys.argv[0])
  10         print("where : p1 < p2 belongs to sizes in datafiles")
  11         sys.exit(-1)
  12
  13 if len(sys.argv) == 4:
  14         p1=int(sys.argv[2])
  15         p2=int(sys.argv[3])
  16
  17 ##-----------------------------------------
  18 ## avg : return average of a list of values
  19 ## param l list of values
  20 ##-----------------------------------------
  21 def avg (l):
  22         sum=0
  23         for e in l:
  24                 sum=sum+e;
  25         return sum/len(l)
  26
  27 ##-------------------------------------------------
  28 ## cov : covariance
  29 ## param X first data vector (..x_i..)
  30 ## param Y second data vector (..x_i..)
  31 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
  32 ##--------------------------------------------------
  33 def cov(X,Y):
  34
  35         n=len(X)   #  n=len(X)=len(Y)
  36         avg_X = avg( X )
  37         avg_Y = avg( Y )
  38         S_XY=0
  39         for i in range(n):
  40                 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
  41
  42         return (S_XY/n)
  43
  44
  45 ##----------------------------------
  46 ## variance : variance
  47 ## param X data vector ( ..x_i.. )
  48 ## (S_X)^2 = (Sum ( x_i - avg(x) )^2 ) / n
  49 ##----------------------------------
  50 def variance( X ):
  51         S_X2 = 0
  52         n = len( X )
  53         avg_X = avg ( X )
  54         for i in range(n):
  55                 S_X2 = S_X2 + ((X[i] - avg_X)**2)
  56
  57         return (S_X2/n)
  58
  59 ##-----------------------------------------------------------------------------------------------
  60 ## correl_split_weighted : compute regression on each segment and
  61 ## return the weigthed sum of correlation coefficients
  62 ## param X first data vector (..x_i..)
  63 ## param Y second data vector (..x_i..)
  64 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
  65 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
  66 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment, n1 the number of values
  67 ##          i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
  68 ##          ...
  69 ##    and C=c1/n1+c2/n2+...
  70 ##-----------------------------------------------------------------------------------------------
  71 def correl_split_weighted( X , Y , segments ):
  72         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
  73         correl = list();
  74         interv = list();  # regr. line coeffs and range
  75         glob_corr=0
  76         sum_nb_val=0
  77         for (start,stop) in segments:
  78                 sum_nb_val = sum_nb_val + stop - start;
  79                 #if start==stop :
  80                 #       return 0
  81                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
  82                 S_X2 = variance(  X [start:stop+1] )
  83                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
  84                 if S_X2*S_Y2 == 0:
  85                         return (0,[])
  86                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
  87                 a = S_XY/S_X2                           # regr line coeffs
  88                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
  89                 print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
  90                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
  91                 interv.append( (a,b, X[start],X[stop]) );
  92
  93         for (c,l) in correl:
  94                 glob_corr = glob_corr + (l/sum_nb_val)*c  # weighted product of correlation
  95                 print('-- %f * %f' % (c,l/sum_nb_val))
  96
  97         print("-> glob_corr=%f\n" % glob_corr)
  98         return (glob_corr,interv);
  99
 100
 101
 102
 103 ##-----------------------------------------------------------------------------------------------
 104 ## correl_split : compute regression on each segment and
 105 ## return the product of correlation coefficient
 106 ## param X first data vector (..x_i..)
 107 ## param Y second data vector (..x_i..)
 108 ## param segments list of pairs (i,j) where i refers to the ith value in X, and  jth value in X
 109 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
 110 ##    where i1,j1 is the first segment,  c1 the correlation coef on this segment,
 111 ##          i2,j2 is the second segment, c2 the correlation coef on this segment,
 112 ##          ...
 113 ##    and C=c1*c2*...
 114 ##-----------------------------------------------------------------------------------------------
 115 def correl_split( X , Y , segments ):
 116         # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
 117         correl = list();
 118         interv = list();  # regr. line coeffs and range
 119         glob_corr=1
 120         for (start,stop) in segments:
 121                 #if start==stop :
 122                 #       return 0
 123                 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
 124                 S_X2 = variance(  X [start:stop+1] )
 125                 S_Y2 = variance(  Y [start:stop+1] )  # to compute correlation
 126                 if S_X2*S_Y2 == 0:
 127                         return (0,[])
 128                 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
 129                 a = S_XY/S_X2                           # regr line coeffs
 130                 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
 131                 print("   range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
 132                 correl.append( (c, stop-start) );   # store correl. coef + number of values (segment length)
 133                 interv.append( (a,b, X[start],X[stop]) );
 134
 135         for (c,l) in correl:
 136                 glob_corr = glob_corr * c  # product of correlation coeffs
 137         print("-> glob_corr=%f\n" % glob_corr)
 138         return (glob_corr,interv);
 139
 140
 141
 142 ##-----------------------------------------------------------------------------------------------
 143 ## main
 144 ##-----------------------------------------------------------------------------------------------
 145 sum=0
 146 nblines=0
 147 skampidat = open(sys.argv[1], "r")
 148
 149
 150 ## read data from skampi logs.
 151 timings = []
 152 sizes = []
 153 readdata =[]
 154 for line in skampidat:
 155         l = line.split();
 156         if line[0] != '#' and len(l) >= 3:   # is it a comment ?
 157         ## expected format
 158         ## ---------------
 159         #count= 8388608  8388608  144916.1       7.6       32  144916.1  143262.0
 160         #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
 161                 readdata.append( (int(l[1]),float(l[3]) / 2 ) );   # divide by 2 because of ping-pong measured
 162                 nblines=nblines+1
 163
 164 ## These may not be sorted so sort it by message size before processing.
 165 sorteddata = sorted( readdata, key=lambda pair: pair[0])
 166 sizes,timings = zip(*sorteddata);
 167
 168
 169 ##----------------------- search for best break points-----------------
 170 ## example
 171 ## p1=2048  -> p1inx=11  delta=3 -> [8;14]
 172 ##                                              8 : segments[(0,7),(8,13),(13,..)]
 173 ##                                              ....
 174 ## p2=65536 -> p2inx=16  delta=3 -> [13;19]
 175
 176 if len(sys.argv) == 4:
 177
 178         p1inx = sizes.index( p1 );
 179         p2inx = sizes.index( p2 );
 180         max_glob_corr = 0;
 181         max_p1inx = p1inx
 182         max_p2inx = p2inx
 183
 184         ## tweak parameters here to extend/reduce search
 185         search_p1 = 30          # number of values to search +/- around p1
 186         search_p2 = 65          # number of values to search +/- around p2
 187         min_seg_size = 3
 188
 189         lb1 = max(1, p1inx-search_p1)
 190         ub1 = min(p1inx+search_p1,search_p1, p2inx);
 191         lb2 = max(p1inx,p2inx-search_p2)        # breakpoint +/- delta
 192         ub2 = min(p2inx+search_p2,len(sizes)-1);
 193
 194         print("** evaluating over \n");
 195         print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
 196         print("rank:   \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
 197         print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
 198         print("rank:   \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
 199         for i in range(lb1,ub1+1):
 200                 for j in range(lb2,ub2+1):
 201                         if i<j:         # segments must not overlap
 202                                 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
 203                                         print("** i=%d,j=%d" % (i,j))
 204                                         segments = [(0,i),(i,j),(j,len(sizes)-1)]
 205                                         (glob_cor, interv) = correl_split( sizes, timings, segments)
 206                                         if ( glob_cor > max_glob_corr):
 207                                                 max_glob_corr = glob_cor
 208                                                 max_interv = interv
 209
 210         for (a,b,i,j) in max_interv:
 211                 print("** OPT: [%d .. %d]" % (i,j))
 212         print("** Product of correl coefs = %f" % (max_glob_corr))
 213
 214         print("#-------------------- cut here the gnuplot code -----------------------------------------------------------\n");
 215         preamble='set output "regr.eps"\n\
 216 set terminal postscript eps color\n\
 217 set key left\n\
 218 set xlabel "Each message size in bytes"\n\
 219 set ylabel "Time in us"\n\
 220 set logscale x\n\
 221 set logscale y\n\
 222 set grid'
 223
 224         print(preamble);
 225         print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
 226         for (a,b,i,j) in max_interv:
 227                 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
 228
 229         print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
 230
 231
 232 else:
 233         print('\n** Linear regression on %d values **\n' % (nblines))
 234         print('\n   sizes=',sizes,'\n\n')
 235         avg_sizes = avg( sizes )
 236         avg_timings = avg( timings )
 237         print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
 238
 239         S_XY= cov( sizes, timings )
 240         S_X2 = variance(  sizes )
 241         S_Y2 = variance(  timings )  # to compute correlation
 242
 243         a = S_XY/S_X2;
 244         correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2))  # corealation coeff (Bravais-Pearson)
 245
 246
 247         b= avg_timings - a * avg_sizes
 248         print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))
 249
 250