3 # Copyright (c) 2010-2011, 2014. The SimGrid Team.
6 # This program is free software; you can redistribute it and/or modify it
7 # under the terms of the license (GNU LGPL) which comes with this package.
9 #---------------------------------------------------------------------------------------------------
11 # % ./regress.py griffon_skampi_pt2pt.ski.dat 65536 120832
14 # Given two vectors of same length n: message size S(.. s_i ..), and communication time T( .. t_i .. )
15 # where t_i is the time associated to a mesage size s_i, computes the segmentation of the vectors
16 # in 3 segments such that linear regressions on the 3 segments maximize correlation.
17 # The metric for correlation is now the cumulative, log error of the estimated time value given
18 # by regression, to the real time value.
20 # Call r(X[i:j],Y[i:j]) = ax + b the regression line for data of X,Y between indices i and j
21 # Call E[i:j] = E( e_i, .., e_j) the vector of estimates from the regression, where e_i = a*S[i] + b
22 # Call mean_logerr( T,T' ) the average log error of paired elements t_i and t'_i, of vectors len n T and T' resp.
23 # i.e mean_logerr( T,T' ) = 1/n * Sum_i^n ( exp(| ln(t_i)-ln(t'_i)|) - 1 )
25 # The script computes indices k and l, s.t.
26 # mean_logerr( r(S[0:k],T[0:k]) , E[0,k] ) +
27 # mean_logerr( r(S[k:l],T[k:l]) , E[k,l] ) +
28 # mean_logerr( r(S[l:n],T[l:n]) , E[l,n] )
30 #---------------------------------------------------------------------------------------------------
33 from math import sqrt,log,exp
36 if len(sys.argv) != 2 and len(sys.argv) != 4:
37 print("Usage : {} datafile".format(sys.argv[0]))
38 print("or : {0} datafile p1 p2".format(sys.argv[0]))
39 print("where : p1 < p2 belongs to sizes in datafiles")
42 if len(sys.argv) == 4:
46 ##-----------------------------------------
47 ## avg : return average of a list of values
48 ## param l list of values
49 ##-----------------------------------------
56 ##-------------------------------------------------
58 ## param X first data vector (..x_i..)
59 ## param Y second data vector (..y_i..)
60 ## = 1/n \Sum_{i=1}^n (x_i - avg(x)) * (y_i - avg(y))
61 ##--------------------------------------------------
63 assert(len(X)==len(Y))
64 n=len(X) # n=len(X)=len(Y)
69 S_XY = S_XY + ((X[i]-avg_X)*(Y[i]-avg_Y))
74 ##----------------------------------
75 ## variance : variance
76 ## param X data vector ( ..x_i.. )
77 ## (S_X)^2 = (Sum ( x_i - avg(X) )^2 ) / n
78 ##----------------------------------
84 S_X2 = S_X2 + ((X[i] - avg_X)**2)
88 ##----------------------------------
90 ## param X data vector ( ..x_i.. ), length n
91 ## param Y data vector ( ..y_i.. ), length n
92 ## return mean( 1/n * Sum_i^n ( exp(| ln(x_i)-ln(y_i)|) - 1 )
93 ##----------------------------------
94 def mean_logerr( X,Y ):
95 assert( len(X) == len(Y) )
96 E = list(); # the list of errors
97 for i in range(len(X)):
98 E.append( exp(abs(log(X[i])-log(Y[i])))-1 )
102 ##-----------------------------------------------------------------------------------------------
103 ## correl_split_weighted_logerr : compute regression on each segment and
104 ## return the weigthed sum of correlation coefficients
105 ## param X first data vector (..x_i..)
106 ## param Y second data vector (..x_i..)
107 ## param segments list of pairs (i,j) where i refers to the ith value in X, and jth value in X
108 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
109 ## where i1,j1 is the first segment, c1 the correlation coef on this segment, n1 the number of values
110 ## i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
112 ## and C=c1/n1+c2/n2+...
113 ##-----------------------------------------------------------------------------------------------
114 def correl_split_weighted_logerr( X , Y , segments ):
115 # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
117 interv = list() # regr. line coeffs and range
119 for (start,stop) in segments:
122 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
123 S_X2 = variance( X [start:stop+1] )
124 a = S_XY/S_X2 # regr line coeffs
125 b = avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
126 # fill a vector (Z) with predicted values from regression
128 for i in range(start,stop+1):
129 Z.append( a * X[i] + b )
130 # compare real values and computed values
131 e = mean_logerr( Y[start:stop+1] , Z )
132 correl.append( (e, stop-start+1) ); # store correl. coef + number of values (segment length)
133 interv.append( (a,b, X[start],X[stop],e) );
136 glob_err = glob_err + (e*l/len( X )) # the average log err for this segment (e) is
137 # weighted by the number of values of the segment (l) out of the total number of values
139 #print("-> glob_corr={}\n".format(glob_corr))
140 return (glob_err,interv);
142 ##-----------------------------------------------------------------------------------------------
143 ## correl_split_weighted : compute regression on each segment and
144 ## return the weigthed sum of correlation coefficients
145 ## param X first data vector (..x_i..)
146 ## param Y second data vector (..x_i..)
147 ## param segments list of pairs (i,j) where i refers to the ith value in X, and jth value in X
148 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
149 ## where i1,j1 is the first segment, c1 the correlation coef on this segment, n1 the number of values
150 ## i2,j2 is the second segment, c2 the correlation coef on this segment, n2 the number of values
152 ## and C=c1/n1+c2/n2+...
153 ##-----------------------------------------------------------------------------------------------
154 def correl_split_weighted( X , Y , segments ):
155 # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
157 interv = list(); # regr. line coeffs and range
160 for (start,stop) in segments:
161 sum_nb_val = sum_nb_val + stop - start;
164 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
165 S_X2 = variance( X [start:stop+1] )
166 S_Y2 = variance( Y [start:stop+1] ) # to compute correlation
169 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
170 a = S_XY/S_X2 # regr line coeffs
171 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
172 #print(" range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
173 correl.append( (c, stop-start) ); # store correl. coef + number of values (segment length)
174 interv.append( (a,b, X[start],X[stop]) );
177 glob_corr = glob_corr + (l/sum_nb_val)*c # weighted product of correlation
178 #print('-- %f * %f' % (c,l/sum_nb_val))
180 #print("-> glob_corr={}\n".format(glob_corr))
181 return (glob_corr,interv);
186 ##-----------------------------------------------------------------------------------------------
187 ## correl_split : compute regression on each segment and
188 ## return the product of correlation coefficient
189 ## param X first data vector (..x_i..)
190 ## param Y second data vector (..x_i..)
191 ## param segments list of pairs (i,j) where i refers to the ith value in X, and jth value in X
192 ## return (C,[(i1,j1,X[i1],X[j1]), (i2,j2,X[i2],X[j2]), ....]
193 ## where i1,j1 is the first segment, c1 the correlation coef on this segment,
194 ## i2,j2 is the second segment, c2 the correlation coef on this segment,
197 ##-----------------------------------------------------------------------------------------------
198 def correl_split( X , Y , segments ):
199 # expects segments = [(0,i1-1),(i1-1,i2-1),(i2,len-1)]
201 interv = list(); # regr. line coeffs and range
203 for (start,stop) in segments:
206 S_XY= cov( X [start:stop+1], Y [start:stop+1] )
207 S_X2 = variance( X [start:stop+1] )
208 S_Y2 = variance( Y [start:stop+1] ) # to compute correlation
211 c = S_XY/(sqrt(S_X2)*sqrt(S_Y2))
212 a = S_XY/S_X2 # regr line coeffs
213 b= avg ( Y[start:stop+1] ) - a * avg( X[start:stop+1] )
214 #print(" range [%d,%d] corr=%f, coeff det=%f [a=%f, b=%f]" % (X[start],X[stop],c,c**2,a, b))
215 correl.append( (c, stop-start) ); # store correl. coef + number of values (segment length)
216 interv.append( (a,b, X[start],X[stop]) );
219 glob_corr = glob_corr * c # product of correlation coeffs
220 return (glob_corr,interv);
224 ##-----------------------------------------------------------------------------------------------
226 ##-----------------------------------------------------------------------------------------------
229 skampidat = open(sys.argv[1], "r")
232 ## read data from skampi logs.
236 for line in skampidat:
238 if line[0] != '#' and len(l) >= 3: # is it a comment ?
241 #count= 8388608 8388608 144916.1 7.6 32 144916.1 143262.0
242 #("%s %d %d %f %f %d %f %f\n" % (countlbl, count, countn, time, stddev, iter, mini, maxi)
243 readdata.append( (int(l[1]),float(l[3])) );
246 ## These may not be sorted so sort it by message size before processing.
247 sorteddata = sorted( readdata, key=lambda pair: pair[0])
248 sizes,timings = zip(*sorteddata);
250 # zip makes tuples; cast to lists for backward compatibility with python2.X
252 timings = list(timings)
254 ##----------------------- search for best break points-----------------
256 ## p1=2048 -> p1inx=11 delta=3 -> [8;14]
257 ## 8 : segments[(0,7),(8,13),(13,..)]
259 ## p2=65536 -> p2inx=16 delta=3 -> [13;19]
261 if len(sys.argv) == 4:
263 p1inx = sizes.index( p1 );
264 p2inx = sizes.index( p2 );
265 max_glob_corr = 999990;
269 ## tweak parameters here to extend/reduce search
270 search_p1 = 70 # number of values to search +/- around p1
271 search_p2 = 70 # number of values to search +/- around p2
273 if (search_p2 + min_seg_size > len(sizes)): # reduce min segment sizes when points close to data extrema
274 min_seg_size = len(sizes)-search_p2
275 if (search_p1 - min_seg_size < 0):
276 min_seg_size = search_p1
278 lb1 = max( 1, p1inx-search_p1 )
279 ub1 = min( p1inx+search_p1, p2inx);
280 lb2 = max( p1inx, p2inx-search_p2) # breakpoint +/- delta
281 ub2 = min( p2inx+search_p2, len(sizes)-1);
283 print("** evaluating over \n");
284 print("interv1:\t %d <--- %d ---> %d" % (sizes[lb1],p1,sizes[ub1]))
285 print("rank: \t (%d)<---(%d)--->(%d)\n" % (lb1,p1inx,ub1))
286 print("interv2:\t\t %d <--- %d ---> %d" % (sizes[lb2],p2,sizes[ub2]))
287 print("rank: \t\t(%d)<---(%d)--->(%d)\n" % (lb2,p2inx,ub2))
290 for i in range(lb1,ub1+1):
291 for j in range(lb2,ub2+1):
292 if i<j: # segments must not overlap
293 if i+1 >=min_seg_size and j-i+1 >= min_seg_size and len(sizes)-1-j >= min_seg_size : # not too small segments
294 #print("** i=%d,j=%d" % (i,j))
295 segments = [(0,i),(i,j),(j,len(sizes)-1)]
296 result.append( correl_split_weighted_logerr( sizes, timings, segments) ) # add pair (metric,interval)
298 # sort results on ascending metric: ok for logerr. Add "reverse=true" for desc sort if you use a correlation metric
299 result = sorted( result, key=lambda pair: pair[0])
302 top_n_sol=5; # tweak to display top best n solution
304 print("#-------------------- result summary ---------------------------------------------------------------------\n");
306 for k in range(top_n_sol):
307 (err,interval) = result[k]
309 print("\n RANK {0}\n-------".format(k))
310 print("** overall metric = {0}".format(err))
311 for (a,b,i,j,e) in interval:
312 print("** OPT: [{0} .. {1}] segment_metric={2} slope: {3} x + {4}".format(i,j,e,a,b))
317 print("#-------------------- Best Solution : cut here the gnuplot code -----------------------------------------------------------\n");
318 preamble='set output "regr.eps"\n\
319 set terminal postscript eps color\n\
321 set xlabel "Each message size in bytes"\n\
322 set ylabel "Time in us"\n\
328 print('plot "%s" u 3:4:($5) with errorbars title "skampi traces %s",\\' % (sys.argv[1],sys.argv[1]));
329 (err,interval) = result[0]
330 for (a,b,i,j,e) in interval:
331 print('"%s" u (%d<=$3 && $3<=%d? $3:0/0):(%f*($3)+%f) w linespoints title "regress. %s-%s bytes",\\' % (sys.argv[1],i,j,a,b,i,j))
333 print("#-------------------- /cut here the gnuplot code -----------------------------------------------------------\n");
337 print('\n** Linear regression on %d values **\n' % (nblines))
338 print('\n sizes=',sizes,'\n\n')
339 avg_sizes = avg( sizes )
340 avg_timings = avg( timings )
341 print("avg_timings=%f, avg_sizes=%f, nblines=%d\n" % (avg_timings,avg_sizes,nblines))
343 S_XY= cov( sizes, timings )
344 S_X2 = variance( sizes )
345 S_Y2 = variance( timings ) # to compute correlation
348 correl = S_XY/(sqrt(S_X2)*sqrt(S_Y2)) # corealation coeff (Bravais-Pearson)
351 b= avg_timings - a * avg_sizes
352 print("[S_XY=%f, S_X2=%f]\n[correlation=%f, coeff det=%f]\n[a=%f, b=%f]\n" % (S_XY, S_X2, correl,correl**2,a, b))