1 """Fractal dimension estimates for analysis of datasets having rank 2 or 3.
2
3 A journal publication concerning the use of these functions appears in
4 IEEE Transactions on BioMedical Engineering, 2008.
5
6 (c) 2005, 2006. Robert Clewley, John Guckenheimer.
7 """
8
9 from __future__ import division
10 from PyDSTool import *
11 from PyDSTool.common import _seq_types
12 from PyDSTool.Toolbox.data_analysis import *
13 from numpy import extract, mean, std, ndarray
14 from scipy.linalg import norm
15 from PyDSTool.matplotlib_import import *
16 from matplotlib import cm
17 from time import localtime
18 from copy import copy
19
20 __all__ = ['corr_dim', 'corr_dim_with_progress',
21 'find_cover', 'find_outliers', 'get_cover_radii',
22 'do_stats', 'nhd', 'timeseq', 'get_rV_curves',
23 'get_filtered_ixs', 'plot_radius_distribution', 'filter_by_radius',
24 'plot_dim_by_thickness', 'plot_thickness_by_dim', 'find_nhd',
25 'prep_secant_figure', 'PD_E', 'slope_range', 'scatterplot_slopes',
26 'scatter_histo', 'rescatter', 'sorted_by_slope', 'sorted_by_radius']
27
28
29
30
32 """Correlation dimension of a data set.
33 """
34 npts = size(data,0)
35 d = zeros((npts*(npts-1)/2,), 'd')
36
37 for i in xrange(1,npts):
38 for j in xrange(i-1):
39 d[(i-1)*(i-2)/2+j] = norm(data[i]-data[j], which_norm)
40 d.sort()
41 logd = log(d).ravel()
42 logv = log(range(1,len(d)+1))
43 nan_ixs = isfinite(logd)
44 logd = extract(nan_ixs, logd)
45 logv = extract(nan_ixs, logv)
46 return (logv, logd)
47
48
50 """Version of correlation dimension with text file progress dump,
51 at a sample rate of 'inforate', because the calculation typically
52 takes a long time.
53 """
54 npts = size(data,0)
55 dsize = npts*(npts-1)/2
56 d = zeros((dsize,), 'd')
57
58 for i in xrange(1,npts):
59 for j in xrange(i-1):
60 ix = (i-1)*(i-2)/2+j
61 if mod(ix,inforate) == 0:
62 dump_progress(ix, dsize)
63 d[ix] = norm(data[i]-data[j])
64 d.sort()
65 logd = log(d).ravel()
66 logv = log(range(1,len(d)+1))
67 nan_ixs = isfinite(logd)
68 logd = extract(nan_ixs, logd)
69 logv = extract(nan_ixs, logv)
70 return (logv, logd)
71
72
74 """Extract time sequence from a covering"""
75 ts = time.take(covering[refpt][3], 0)
76 print "time sequence min = %f, max = %f"%(min(ts),max(ts))
77 return ts
78
79
80
81
82
83 -def find_cover(data, remain_tol=5,
84 step=None, initref=None, wlo=1, whi=1,
85 check_large_nhds=True, large_nhd=300,
86 min_nhd_size=50, min_ix=25,
87 max_res=1.0, max_std=0.2, quiet=True):
88 """
89 OUTPUTS:
90
91 covering is a dict:
92 reference point -> (dim, low radius, high radius, covered_indices)
93 covered is a dict:
94 point -> reference points whose neighbourhoods include it
95 """
96 covering = {}
97 covered = {}.fromkeys(range(len(data)),None)
98 not_covered = covered.keys()
99 if remain_tol > len(data):
100 remain_tol = 0
101 makerefpt_ixs = []
102 if initref is not None:
103 refptix = initref
104 else:
105 refptix = 0
106 N = size(data,0)
107 start_ix = int(round(sqrt(N)))
108 print "N = ", N, "start ix = ", start_ix
109 logv_raw = log(range(1,N+1))
110 done = False
111 unsuccessful = []
112 if step is None:
113 ixstep = max([int(around(len(data)/2000.)),1])
114 elif step > 0:
115 if step < 1:
116
117 ixstep = int(math.ceil(len(d)*step))
118 if not quiet:
119 print "Using index step %i out of %i points"%(ixstep, len(d))
120 else:
121 ixstep = step
122 else:
123 raise ValueError("Invalid step argument")
124 while not done:
125 if not quiet:
126 print "\n*************\nCalculating distances and point-wise dimension around pt %i ..."%refptix
127 logv, logd, d, d_inv_dict = log_distances_with_D(data, refptix, logv_raw)
128 d_inv_keys = sortedDictKeys(d_inv_dict)
129 d_inv_vals = sortedDictValues(d_inv_dict)
130 d_inv = Pointset(indepvararray=d_inv_keys, indepvartype=Float,
131 coordarray=d_inv_vals, coordtype=Int, tolerance=5e-5)
132 if not quiet:
133 print "Finding self-consistent neighbourhood",
134
135
136
137 err=-1
138 len_logd = len(logd)
139 old_dim = None
140 hiix = start_ix
141 if not quiet:
142 print "Start ix = ", start_ix
143 loix=start_ix-2
144 nhd_too_small = False
145 ref_dim = -1
146 all_res = []
147 in_zone = False
148
149 while not in_zone and loix > min_ix+ixstep:
150 loix -= ixstep
151 pfit=polyfit(logd[loix:hiix],logv[loix:hiix],1)
152 dim = pfit[0]
153 residual = get_linear_regression_residual(pfit, logd[loix:hiix],logv[loix:hiix],
154 weight='')
155 all_res.append(residual)
156 if not quiet and mod(hiix,50) == 0:
157 print ".",
158 if old_dim is not None:
159 sd = std(all_res)
160
161
162 if sd > max_std:
163 in_zone = True
164 old_dim = dim
165
166 all_res = []
167 not_done = True
168 while not_done and hiix < len_logd-1-ixstep:
169 hiix += ixstep
170
171 pfit=polyfit(logd[loix:hiix],logv[loix:hiix],1)
172 dim = pfit[0]
173 residual = get_linear_regression_residual(pfit, logd[loix:hiix],logv[loix:hiix],
174 weight='lo', w=wlo)
175 all_res.append(residual)
176 if not quiet:
177 print "(%i,%i)"%(loix,hiix),
178 print "dim=%.4f, residual = %.4f"%(dim,residual)
179 if not quiet and mod(hiix,50) == 0:
180 print ".",
181 try:
182 sd = std(all_res)
183 except:
184 sd = 0
185
186
187
188
189 if sd > max_std:
190 if not quiet:
191 print "residuals s.d. > max_std"
192 not_done = False
193 old_dim = dim
194 nhd_too_small = hiix-loix < min_nhd_size
195 if not quiet:
196 print "nhd_too_small = ", nhd_too_small
197 if nhd_too_small:
198 if not quiet:
199 print "Neighbourhood too small. Moving to a different reference point..."
200 unsuccessful.append(refptix)
201 else:
202 if not quiet:
203 print "\nDimension = %f"%dim
204 print "Found best fit line from relative ix %i to %i (radius %f)"%(loix, hiix, d[hiix])
205
206 covered_ixs = [d_inv(d[ix])[0] for ix in range(loix, hiix+1)]
207 covered_ixs.append(refptix)
208
209
210 if hiix-loix > large_nhd and check_large_nhds:
211 num_recheck = int(around((hiix-loix)/large_nhd))
212 for i in range(num_recheck):
213 ix = random.randrange(0, hiix-loix)
214 if ix not in makerefpt_ixs and ix != refptix:
215 makerefpt_ixs.append(ix)
216 covering[refptix] = (dim, d[loix], d[hiix], covered_ixs)
217 for ix in covered_ixs:
218 if covered[ix] is None:
219 covered[ix] = [refptix]
220 else:
221 covered[ix].append(refptix)
222 not_covered = remain(not_covered, covered_ixs)
223
224
225
226 num_uncovered = len(not_covered)
227
228 print "%i points left to cover"%(num_uncovered+len(makerefpt_ixs))
229 if num_uncovered < remain_tol:
230 if len(makerefpt_ixs) == 0:
231 done = True
232 else:
233
234 refptix = makerefpt_ixs.pop()
235 check_large_nhds = False
236 else:
237 pick_unsuccessful = 0
238 while pick_unsuccessful < num_uncovered and pick_unsuccessful >= 0:
239 refptix = not_covered[random.randrange(0,num_uncovered)]
240 if refptix in unsuccessful:
241 pick_unsuccessful += 1
242 else:
243 pick_unsuccessful = -1
244 if pick_unsuccessful == num_uncovered:
245 done = True
246 return (covered, covering)
247
248
249
250 -def find_nhd(data, tol_up=.2, tol_down=0.05, quiet=True, max_res=0.2, max_std=0.1,
251 step=None, initref=None, min_ix=10, min_nhd_size=20,
252 max_radius_ratio=1.5):
253 """Find a single neighbourhood around a reference point."""
254 covering = {}
255 covered = {}.fromkeys(range(len(data)),None)
256 if initref is not None:
257 refptix = initref
258 else:
259 refptix = 0
260 logv_raw = log(range(1,size(data,0)+1))
261 if not quiet:
262 print "\n*************\nCalculating distances and point-wise dimension around pt %i ..."%refptix
263 doplot = 1
264 else:
265 doplot = 0
266 logv, logd, d, d_inv_dict = log_distances_with_D(data, refptix, logv_raw)
267 d_inv_keys = sortedDictKeys(d_inv_dict)
268 d_inv_vals = sortedDictValues(d_inv_dict)
269 d_inv = Pointset(indepvararray=d_inv_keys, indepvartype=Float,
270 coordarray=d_inv_vals, coordtype=Int, tolerance=5e-5)
271 if not quiet:
272 print "Finding self-consistent neighbourhood",
273
274
275
276 err=-1
277 len_logd = len(logd)
278 old_dim = None
279 loix=min_ix
280 hiix=loix+1
281 nhd_too_small = (len_logd < min_ix+10)
282 if step is None:
283 ixstep = max([int(around(len(data)/2000.)),1])
284 elif step > 0:
285 if step < 1:
286
287 ixstep = int(math.ceil(len(d)*step))
288 if not quiet:
289 print "Using index step %i out of %i points"%(ixstep, len(d))
290 else:
291 ixstep = step
292 else:
293 raise ValueError("Invalid step argument")
294 if not quiet:
295 plot(logd,logv)
296 not_done = not nhd_too_small
297 in_zone = False
298 ref_dim = -1
299 log_max_radius_ratio = log(max_radius_ratio)
300 all_res = []
301 while not_done and hiix < len_logd-1-ixstep:
302 hiix += ixstep
303
304 pfit = polyfit(logd[loix:hiix],logv[loix:hiix],1)
305 dim = pfit[0]
306 residual = get_linear_regression_residual(pfit, logd[loix:hiix],logv[loix:hiix])
307
308 all_res.append(residual)
309 if not quiet:
310 print "Dim = %.3f in [%i,%i], Residual = %.4f "%(dim,loix,hiix,residual),
311 plot([logd[loix],logd[hiix]],[logd[loix]*pfit[0]+pfit[1],logd[hiix]*pfit[0]+pfit[1]])
312
313
314 if old_dim is not None:
315
316 sd = std(all_res)
317 if not quiet:
318 print "S.d. = %.4f "%sd
319 if residual > max_res:
320 print "residual > max_res"
321 not_done = False
322 if sd > max_std:
323 print "residuals s.d. > max_std"
324 not_done = False
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344 old_residual = residual
345 old_dim = dim
346 nhd_too_small = nhd_too_small or hiix-loix < min_nhd_size
347 if nhd_too_small:
348 print "Neighbourhood too small. Try a different starting index or a new reference point ..."
349 print "Dim found over ixs [%i, %i] = %.4f"%(loix,hiix,dim)
350 raise RuntimeError
351 else:
352 if not quiet:
353 print "\nDimension = %f"%dim
354 print "Found best fit line from relative ix %i to %i (radius %f)"%(loix, hiix, d[hiix])
355
356 covered_ixs = [d_inv(d[ix])[0] for ix in range(loix, hiix+1)]
357 covered_ixs.append(refptix)
358 covering[refptix] = (dim, d[loix], d[hiix], covered_ixs)
359 for ix in covered_ixs:
360 if covered[ix] is None:
361 covered[ix] = [refptix]
362 else:
363 covered[ix].append(refptix)
364 return (covered, covering)
365
366
367
368 -def do_stats(covering, covered, maxD, bin_width=1,
369 fignum=1, num_panels=4, s_frac=0.5,
370 nhd_size_thresh=None, nhd_max_plot_size=None, minD=None,
371 D_estimate_method="absolute", weight_by_size=False,
372 force_xaxis_limits=False, radius_yaxis_limits=None,
373 save=''):
374 """Produces statistics of pointwise dimension estimates of a data set,
375 including a histogram of the distribution of neighbourhood dimensions.
376
377 R. Clewley, June 2006.
378
379 INPUTS:
380
381 maxD sets the maximum dimension to cater for.
382
383 bin_width selects the width of bins for dimensions (default 1).
384
385 fignum forces a particular figure number to be used for display.
386 (Use fignum=0 to suppress plotting of the results and just return
387 the statistics.)
388
389 num_panels argument must be 2 (for just nhd size and bin histo) or
390 4 (also include min and max nhd radius).
391
392 s_frac selects the fraction (or multiple) of the standard deviation of
393 neighbourhood sizes to use as the cutoff for the "bulk" of the data
394 set, in order to avoid including large outliers in determining
395 nhd_size_threshold (default 0.5).
396
397 nhd_size_threshold selects the smallest neighbourhood size to be
398 included in histogram of neighbourhood dimensionalities
399 (default None => automatic selection, i.e., the mean of the first
400 s_frac of the neighbourhood sizes' standard deviation).
401 Set to zero to prevent it being used or plotted.
402
403 nhd_max_plot_size selects the y-axis limit on the plot of neighbourhood
404 sizes (default None => automatic selection).
405
406 D_estimate_method can be 'absolute' (uses cutoff of size = 1 to estimate D
407 in histogram tail), or 'relative' (uses a cutoff = %age of peak binned
408 neighbourhoods)
409
410 force_xaxis_limits (Boolean) determines whether x-axis limits should be
411 forced to be the value of the (minD,maxD) parameters.
412
413 minD sets the minimum dimension to cater for (default to auto-choice).
414
415 radius_yaxis_limits (pair) sets the r_min and r_max y-axis upper limits,
416 respectively.
417
418 save (string) sets a filename to use for saving the figure in available
419 formats (so don't include a file extension).
420
421
422 OUTPUTS:
423
424 d_stats: dictionary {"D_tail": estimated D from tail,
425 "D_mean": histo mean,
426 "D_std": histo std dev}
427
428 dbins: Pointset of D bins
429
430 cover_by_dimix: dictionary of covers by index of dimensions binned
431 (i.e., [0, bin_width, 2*bin_width, ..., maxD+1])
432
433 cover_by_size: list of neighbourhood coverings ordered by their size
434
435 cbu: array indicating how many points in overlap between neighbourhoods
436 (array index corresponds to a number of points shared,
437 value = number of nhds with that number of points shared)
438
439 ixs: list of indices of neighbourhoods of size > nhd_size_threshold
440
441 (integral: internal diagnostic feature)
442 """
443 dx = zeros((len(covering),1),'f')
444 ly = zeros((len(covering),1),'f')
445 rloy = zeros((len(covering),1),'f')
446 rhiy = zeros((len(covering),1),'f')
447 drange = arange(0, maxD+1, bin_width, 'd').tolist()
448 dbins = data_bins('D', drange)
449 cover_by_dimix = {}.fromkeys(range(len(drange)))
450 for dix in range(len(drange)):
451 cover_by_dimix[dix] = []
452 cover_by_size_dict = {}.fromkeys(range(len(covered)+1))
453 ix = 0
454 integral = 0
455 largest = 0
456 try:
457 for p, (d, rlo, rhi, l) in covering.iteritems():
458 dx[ix] = d
459 rloy[ix] = rlo
460 rhiy[ix] = rhi
461 lenl = len(l)
462 if lenl > largest:
463 largest = lenl
464 try:
465 cover_by_size_dict[lenl].append(p)
466 except AttributeError:
467 cover_by_size_dict[lenl] = [p]
468 ly[ix] = lenl
469 integral += lenl
470 ix += 1
471 except ValueError:
472
473
474 for p, (d, rlo, l) in covering.iteritems():
475 dx[ix] = d
476 rloy[ix] = rlo
477 rhiy[ix] = 0
478 lenl = len(l)
479 if lenl > largest:
480 largest = lenl
481 try:
482 cover_by_size_dict[lenl].append(p)
483 except AttributeError:
484 cover_by_size_dict[lenl] = [p]
485 ly[ix] = lenl
486 integral += lenl
487 ix += 1
488 print "\nNeighbourhood statistics:"
489 print "There are %i neighbourhoods to this covering"%len(covering)
490 print "Largest neighbourhood has %i points"%largest
491 cover_by_size = [di for di in cover_by_size_dict.iteritems() if di[1] is not None]
492 cover_by_size.sort()
493 csizes = [c[0] for c in cover_by_size]
494 if nhd_size_thresh is None:
495 s = std(csizes)
496 sm = s_frac*s
497 m = mean(csizes)
498 print "Std. dev. of cover_by_size =", s
499 print "Max size found =", max(csizes)
500 print "Mean size found =", m
501
502
503
504
505
506
507
508
509
510
511
512 nhd_size_thresh = m-sm
513 print "N'hood size threshold used = mean - %.3f of std. dev."%s_frac
514 print " =", nhd_size_thresh
515 else:
516 if nhd_size_thresh > 0:
517 print "N'hood size threshold set by user =", nhd_size_thresh
518 if nhd_max_plot_size is None:
519
520
521
522
523
524
525 nhd_max_plot_size = largest
526 print "Using max plot nhd size of largest neighbourhood found =", nhd_max_plot_size
527 else:
528 print "Using max plot nhd size set by user =", nhd_max_plot_size
529
530 cover_by_size.reverse()
531 filtered_ixs = []
532 try:
533 for p, (d, rlo, rhi, l) in covering.iteritems():
534 if d <= maxD:
535 dix = dbins.resolve_bin_index(d)
536 if len(l) > nhd_size_thresh:
537 if weight_by_size:
538 inc = len(l)/largest
539 else:
540 inc = 1
541 dbins.increment(d, inc)
542 filtered_ixs.append(p)
543 cover_by_dimix[dix].append(p)
544 except ValueError:
545
546 for p, (d, rlo, l) in covering.iteritems():
547 if d <= maxD:
548 dix = dbins.resolve_bin_index(d)
549 if len(l) > nhd_size_thresh:
550 if weight_by_size:
551 inc = len(l)/largest
552 else:
553 inc = 1
554 dbins.increment(d, inc)
555 filtered_ixs.append(p)
556 cover_by_dimix[dix].append(p)
557
558 ks=dbins.midpoints
559 d_est = 0
560 if D_estimate_method == "absolute":
561
562
563 if len(covering) < 3:
564 dbin_thresh = 0
565 else:
566 dbin_thresh = 1
567 for k in ks:
568 n = dbins(k)
569 if d_est > 0 and n == 0:
570
571 break
572 if n > dbin_thresh:
573 d_est = k
574 elif D_estimate_method == "relative":
575 peak = 0
576 last_peak = 0
577 for k in ks:
578 n = dbins(k)
579 if n > peak and n > 1:
580 peak = n
581 last_peak = n
582 if d_est > 0 and n == 0:
583
584 break
585 if peak > 0:
586
587 if n > .2*last_peak:
588
589 d_est = k
590
591 last_peak = n
592 else:
593 raise ValueError("Invalid dimension estimation method name")
594 if fignum > 0:
595 fontmath = args(fontsize=22,fontname='Times')
596 fonttext = args(fontsize=18,fontname='Times')
597
598 print "Plotting histograms to figure", fignum
599 assert num_panels in [2,4], "num_panels argument must be 2 or 4"
600 figure(fignum)
601 if num_panels == 4:
602 subplot(2,2,1)
603 plot(dx, rhiy, 'ro')
604 glines = getp(gca(), 'xticklines')+getp(gca(), 'yticklines')
605 setp(glines, 'linewidth', 2)
606
607 xlabel(r'$\rm{Dimension}$', fonttext)
608 ylabel(r'$r_{max}$', fontmath)
609 if num_panels == 4:
610 subplot(2,2,2)
611 else:
612 subplot(2,1,1)
613 plot(dx, ly, 'ko')
614 glines = getp(gca(), 'xticklines')+getp(gca(), 'yticklines')
615 setp(glines, 'linewidth', 2)
616 dmin, dmax = figure(fignum).axes[0].get_xlim()
617 if dmax > maxD or force_xaxis_limits:
618 dmax = maxD
619 if dmin < minD or force_xaxis_limits:
620 if minD is not None:
621 dmin = minD
622 if nhd_size_thresh > 0:
623 plot([dmin,dmax],[nhd_size_thresh,nhd_size_thresh],'k--',linewidth=1.2)
624
625 xlabel(r'$\rm{Dimension}$', fonttext)
626 ylabel(r'$\rm{N-hood \ size}$', fonttext)
627 if num_panels == 4:
628 subplot(2,2,3)
629 plot(dx, rloy, 'go')
630 glines = getp(gca(), 'xticklines')+getp(gca(), 'yticklines')
631 setp(glines, 'linewidth', 2)
632
633 xlabel(r'$\rm{Dimension}$', fonttext)
634 ylabel(r'$r_{min}$', fontmath)
635 if num_panels == 4:
636 subplot(2,2,4)
637 else:
638 subplot(2,1,2)
639 dbx = dbins.midpoints.tolist()
640 dby = dbins.values()
641 bar(dbx, dby, color='b', width=dbx[1]-dbx[0])
642 for panel in range(num_panels):
643 figure(fignum).axes[panel].set_xlim([dmin,dmax])
644
645 xlabel(r'$\rm{Dimension}$', fonttext)
646 ylabel(r'$\rm{Number \ binned}$',fonttext)
647
648
649
650 cbins = zeros((100,),'i')
651 max_used = 0
652 for clist in covered.itervalues():
653 try:
654 n = len(clist)
655 except TypeError:
656
657 n = 0
658 if n > max_used:
659 max_used = n
660 if n < maxD+1:
661 cbins[n] = cbins[n] + 1
662
663 cbu = cbins[0:max_used+1]
664 if fignum > 0:
665
666
667
668
669
670 if num_panels == 4:
671 figure(fignum).axes[0].set_position([0.1, 0.55, 0.37, 0.35])
672 figure(fignum).axes[1].set_position([0.57, 0.55, 0.37, 0.35])
673 figure(fignum).axes[2].set_position([0.1, 0.09, 0.37, 0.35])
674 figure(fignum).axes[3].set_position([0.57, 0.09, 0.37, 0.35])
675 figure(fignum).axes[1].set_ylim([0,nhd_max_plot_size])
676 if radius_yaxis_limits is not None:
677 assert radius_yaxis_limits[1] >= radius_yaxis_limits[0], \
678 "Specify radius y-axis limits in increasing order"
679 ylo1 = figure(fignum).axes[0].get_ylim()[0]
680 ylo0 = figure(fignum).axes[2].get_ylim()[0]
681 figure(fignum).axes[0].set_ylim([ylo1,radius_yaxis_limits[1]])
682 figure(fignum).axes[2].set_ylim([ylo0,radius_yaxis_limits[0]])
683 else:
684 figure(fignum).axes[0].set_position([0.125, 0.57, 0.775, 0.364])
685 figure(fignum).axes[1].set_position([0.125, 0.08, 0.775, 0.364])
686 figure(fignum).axes[0].set_ylim([0,nhd_max_plot_size])
687 draw()
688 if save != '':
689 savefig(save+'.png')
690 savefig(save+'.svg')
691 savefig(save+'.eps')
692
693
694 tail_ix = dbins.resolve_bin_index(d_est)
695 if isinstance(tail_ix, tuple):
696 tail_ix = tail_ix[1]
697 try:
698 m = dbins[:tail_ix].mean()
699 s = dbins[:tail_ix].std()
700 except:
701 print "Tail found at index %i and D = %.3f"%(tail_ix,d_est)
702 print " -- problem computing mean and std dev for this mode"
703 print " so calculating for whole data set"
704 m = dbins.mean()
705 s = dbins.std()
706 print "Estimate dimension to be (to resolution of histogram bin width):", d_est
707 print " with histogram mean D = %.5f and std dev D = %.5f"%(m, s)
708 return {"D_tail": d_est, "D_mean": m, "D_std": s}, \
709 dbins, cover_by_dimix, cover_by_size, cbu, filtered_ixs, integral
710
711
713 """Returns the list of indices in covering for which the neighbourhood
714 size was larger than the threshold for dimension binning."""
715 fixs=[]
716 for p, ixlist in cover_by_dimix.iteritems():
717 fixs.extend(ixlist)
718 fixs.sort()
719 return fixs
720
721
722 -def find_outliers(data, plotdata, cluster_dim, cluster_width, max_central=50):
723 outliers = []
724 central = []
725 assert cluster_width > 0 and cluster_dim - cluster_width > 1
726 for i, (dlo, dhi, refix) in enumerate(plotdata):
727 refix = int(refix)
728 if dlo < 1:
729 outliers.append(refix)
730 if dlo > cluster_dim-cluster_width and dlo < cluster_dim+cluster_width:
731 if len(central) < max_central:
732 central.append(refix)
733 num_central = len(central)
734 central_data = array([data[cix] for cix in central])
735 if max_central > 0:
736 c_mean = mean(central_data,0)
737 else:
738 c_mean = central_data[0]
739 do = zeros((len(outliers),),float)
740 dc = zeros((num_central,),float)
741 for i, oix in enumerate(outliers):
742 do[i]=norm(c_mean-data[oix])
743 for i, cdata in enumerate(central_data):
744 dc[i]=norm(c_mean-cdata)
745 ds = []
746 for i in xrange(len(data)):
747 if i not in outliers:
748 ds.append(norm(data[i]-data[outliers[0]]))
749 ds = array(ds)
750 if num_central > 1:
751 mean_dc = mean(dc)
752 std_dc = std(dc)
753 else:
754 mean_dc = dc[0]
755 std_dc = 0
756
757 return (mean(do), std(do), mean_dc, std_dc, mean(ds), std(ds), do)
758
759
761 """Return the radius of a covering"""
762 if method == 'divide':
763 rfun = lambda rh, rl: rh/rl
764 elif method == 'subtract':
765 rfun = lambda rh, rl: rh-rl
766 radii = {}
767 if ixs is None:
768 for p, (d, rlo, rhi, l) in covering.iteritems():
769 radii[p] = rfun(rhi,rlo)
770 else:
771 for p, (d, rlo, rhi, l) in covering.iteritems():
772 if p in ixs:
773 radii[p] = rfun(rhi,rlo)
774 return radii
775
776
778 """Returns list of filtered (index, radius) pairs and corresponding
779 sub-coverings' dimensions"""
780 if rhi is None:
781 assert rlo is not None
782 test = lambda r: r > rlo
783 else:
784 if rlo is None:
785 test = lambda r: r < rhi
786 else:
787 test = lambda r: r > rlo and r < rhi
788 rfilt = [(ix,r) for (ix, r) in radii.iteritems() if test(r)]
789 return rfilt, [covering[rinfo[0]][0] for rinfo in rfilt]
790
791
793 """Expects radii dictionary, as returned by get_cover_radii.
794 Returns figure handle."""
795 rads = array(radii.values())
796 dbins = data_bins('r', arange(0, max_radius+radius_bin_width, radius_bin_width))
797 dbins.bin(rads)
798 dbx, dby = dbins.to_array()
799 f = figure()
800 bar(dbx,dby,width=rbin)
801 title('Distribution of nhd radii')
802 xlabel('r')
803 ylabel('#')
804 return f
805
806
808 """Plot of dimensions of neighbourhoods with thickness r, using
809 binned data of bin width rwidth. r is defined as the neighbourhood's
810 r_max - r_min.
811
812 If rwidth=0 (default) then no data binning is done.
813 Returns figure handle."""
814 rlo, rhi = extent(radii.values())
815 assert rlo > 0 and rhi > rlo
816 f=figure()
817 if rwidth > 0:
818 assert rwidth < rhi-rlo
819 for r in arange(0, rhi+rwidth, rwidth):
820 rs,ds=filter_by_radius(covering, radii, r, r+rwidth)
821 if ds != []:
822 plot([r], ds, plotstyle)
823 title('Distribution of dimension by radius bin (width '+str(rwidth)+')')
824 else:
825 rs,ds=filter_by_radius(covering, radii, 0, rhi)
826 if ds != []:
827 plot([r[1] for r in rs], ds, plotstyle)
828
829 xlabel(r'$r_{max} \ - \ r_{min}$',fontsize=22)
830 ylabel(r'$\rm{Dimension}$',fontsize=20)
831 return f
832
833
835 """Plot of thickness of neighbourhoods as a function of their
836 estimated dimension. r is defined as the neighbourhood's
837 r_max - r_min.
838
839 Returns figure handle."""
840 rlo, rhi = extent(radii.values())
841 assert rlo > 0 and rhi > rlo
842 f = figure()
843 rs, ds = filter_by_radius(covering, radii, 0, rhi)
844 if ds != []:
845 plot(ds, [r[1] for r in rs], plotstyle)
846 ylabel(r'$r_{max} \ - \ r_{min}$',fontsize=22)
847 xlabel(r'$\rm{Dimension}$',fontsize=20)
848 return f
849
850
851 -def nhd(data, covering, refpt):
852 return data.take(covering[refpt][3], 0)
853
854
855
856
858 """Get r-V curves for pointwise dimension estimate of data"""
859 if doplot:
860 figure()
861 rV = {}
862 for refpt in refpts:
863 if show_seq:
864 logv, logd, ixs_sorted = log_distances(data, refpt, doplot,
865 return_ixs=True)
866 else:
867 logv, logd = log_distances(data, refpt, doplot,
868 return_ixs=False)
869 if show_seq:
870
871 notinseq = out_of_seq(ixs_sorted)
872 inseq = remain(ixs_sorted, notinseq)
873 else:
874 notinseq = []
875 inseq = []
876 rV[refpt] = (logv, logd, inseq, notinseq)
877 return rV
878
879
881 return {
882 'Dim': items[0],
883 'refpt': int(items[1]),
884 'r_min_minslope': items[2],
885 'r_min_maxslope': items[3],
886 'index': int(items[4])
887 }
888
1049
1050
1051 -def slope_range(data, refptix, step=2, Delta=10, startix=10, stopix=None,
1052 doplot=False, maxD=Inf):
1053 """Slope range by secants over a range of log(Delta) ball volumes"""
1054 N=size(data,0)
1055 assert Delta > 1
1056 assert N > Delta*startix
1057 logv_raw = log(range(1,N+1))
1058 logv, logd, d, d_inv_dict = log_distances_with_D(data, refptix, logv_raw)
1059 hi_slope = 0
1060 lo_slope = 1e10
1061 loix = startix
1062 hiix = loix*Delta
1063 if stopix is None:
1064 stopix = N
1065 assert stopix <= N
1066 assert hiix < stopix
1067 logDelta = log(Delta)
1068 slope_by_loix = {}
1069 while True:
1070
1071 slope = logDelta/(logd[hiix]-logd[loix])
1072 if slope > hi_slope and slope < maxD:
1073 hi_slope = slope
1074 hi_slope_ix = loix
1075 if slope < lo_slope and slope < maxD:
1076 lo_slope = slope
1077 lo_slope_ix = loix
1078 slope_by_loix[loix] = slope
1079 loix += step
1080 hiix = Delta*loix
1081 if hiix >= stopix:
1082 break
1083 if doplot:
1084 plot(sortedDictValues(slope_by_loix))
1085 return (slope_by_loix, lo_slope, lo_slope_ix,
1086 hi_slope, hi_slope_ix, logv, logd)
1087
1088
1089
1090 -def scatterplot_slopes(data, num_samples=None, startix=10, stopix=None,
1091 Delta=10, marker='o', marker_size=40, cmap=cm.hot,
1092 color_source=None, step=5, fignum=None, maxD=Inf):
1093 """fignum = 0 switches off plotting (just return statistics)"""
1094 if num_samples is None:
1095 num_samples = len(data)/5
1096 range_step = 5
1097 else:
1098 range_step = int(len(data)/num_samples)
1099 if range_step > 0:
1100 refpts = arange(0,len(data),range_step,'i')
1101 else:
1102 raise "too many sample points specified"
1103 num_refpts = len(refpts)
1104 plotdata = zeros((num_refpts,3),Float)
1105 ixdata = {}
1106
1107 colors = zeros((num_refpts,3),Float)
1108 if color_source == 'lo':
1109 cix=0
1110 title_str = 'min'
1111 elif color_source == 'hi':
1112 cix=1
1113 title_str = 'max'
1114 elif color_source is None:
1115 title_str = ''
1116 col = array([0.3,0.3,0.3])
1117 else:
1118 raise "invalid color source"
1119 print "Expect %i dots when finished:"%int(num_refpts/50)
1120 for i in range(num_refpts):
1121 if mod(i,50) == 49:
1122 print ".",
1123 slope_by_loix, lo, loix, hi, hiix, logv, logd = slope_range(data,
1124 refpts[i], startix=startix, stopix=stopix,
1125 Delta=Delta, step=step, maxD=maxD)
1126 plotdata[i,:] = array([lo,hi,refpts[i]])
1127 ixdata[i] = [(loix,hiix),refpts[i]]
1128 colors[i,:] = array([exp(logd[loix]),exp(logd[hiix]),refpts[i]])
1129 if fignum is None:
1130 figure()
1131 dofig = True
1132 elif fignum == 0:
1133 dofig = False
1134 else:
1135 figure(fignum)
1136 dofig = True
1137 if dofig:
1138 if color_source is None:
1139 handle=scatter(plotdata[:,0],plotdata[:,1],marker=marker,c=col,
1140 s=marker_size)
1141 else:
1142 handle=scatter(plotdata[:,0],plotdata[:,1],marker=marker,c=colors[:,cix],
1143 cmap=cmap,s=marker_size)
1144 colorbar()
1145 fonttext = args(fontsize=20,fontname='Times')
1146 xlabel(r'$\rm{min \ slope}$',fonttext)
1147 ylabel(r'$\rm{max \ slope}$',fonttext)
1148 if title_str != '':
1149 title('Color-coded by r_min of '+title_str+' slope')
1150 else:
1151 handle=None
1152 s=(sorted_by_slope(plotdata,0),sorted_by_slope(plotdata,1))
1153 if color_source is None:
1154 c = ()
1155 else:
1156 c=(sorted_by_radius(colors,0),sorted_by_radius(colors,1))
1157 stats = {'min_slope': {}, 'max_slope': {}}
1158 stats['min_slope']['mean']=mean(plotdata[:,0])
1159 stats['max_slope']['mean']=mean(plotdata[:,1])
1160 stats['min_slope']['std']=std(plotdata[:,0])
1161 stats['max_slope']['std']=std(plotdata[:,1])
1162 stats['min_slope']['min']=min(plotdata[:,0])
1163 stats['max_slope']['min']=min(plotdata[:,1])
1164 stats['min_slope']['max']=max(plotdata[:,0])
1165 stats['max_slope']['max']=max(plotdata[:,1])
1166 print "\n"
1167 return (plotdata, s, colors, c, ixdata, stats, handle)
1168
1169
1171 """select: 0 = rmin, 1 = rmax"""
1172 ixs = argsort(plotdata, 0)[:,select]
1173 return zip(take(plotdata, ixs, 0)[:,select], ixs)
1174
1176 """select: 0 = rmin, 1 = rmax"""
1177 ixs = argsort(colors, 0)[:,select]
1178 return zip(take(colors, ixs, 0)[:,select], ixs)
1179
1181 if isinstance(v, _seq_types):
1182 a=array(x)
1183 res = [find(a, y, next_largest, indices) for y in v]
1184 return res
1185 else:
1186 return find(array(x), v, next_largest, indices)
1187
1188
1189 -def rescatter(plotdata, colors, color_source=None, marker='o', marker_size=40,
1190 cmap=cm.hot, newfigure=True):
1191 if color_source == 'lo':
1192 cix=0
1193 title_str = 'min'
1194 elif color_source == 'hi':
1195 cix=1
1196 title_str = 'max'
1197 elif color_source is None:
1198 title_str = ''
1199 if not isinstance(colors, str):
1200 assert shape(colors) == (3,)
1201 else:
1202 raise "invalid color source"
1203 if newfigure:
1204 figure()
1205 if color_source is None:
1206 handle=scatter(plotdata[:,0],plotdata[:,1],marker=marker,
1207 c=colors,s=marker_size)
1208 else:
1209 handle=scatter(plotdata[:,0],plotdata[:,1],marker=marker,
1210 c=colors[:,cix],cmap=cmap,s=marker_size)
1211 colorbar()
1212 fonttext_v = args(fontsize=22,fontname='Times',
1213 horizontalalignment='right')
1214 fonttext_h = args(fontsize=22,fontname='Times',
1215 verticalalignment='top')
1216 xlabel(r'$\rm{min \ slope}$',fonttext_h)
1217 ylabel(r'$\rm{max \ slope}$',fonttext_v)
1218 if title_str != '':
1219 title('Color-coded by r_min of '+title_str+' slope')
1220 return handle
1221
1222
1223 -def scatter_histo(plotdata,select=1,bin_width=1,maxD=None,
1224 xlim=None,ylim=None):
1225 ds = array(plotdata[:,select])
1226 ds.shape = (len(plotdata),)
1227 if maxD is None:
1228 maxD = int(ceil(max(ds[:,0])))
1229 drange = arange(0, maxD+1+bin_width, bin_width, typecode='d')
1230 dbins = data_bins('D', drange)
1231 dbins.bin(ds)
1232 dbx, dby = dbins.to_array()
1233 f=figure()
1234 b=bar(dbx, dby, color='b', width=dbx[1]-dbx[0])
1235 if xlim is not None:
1236 f.axes[0].set_xlim(xlim)
1237 draw()
1238 if ylim is not None:
1239 f.axes[0].set_ylim(ylim)
1240 draw()
1241
1242
1243 -def PD_E(a, secfig=5, verbose=False, saveplot=True, force=False):
1244 """Pointwise dimension estimation (PD-E) using secants of r-V curves.
1245 See:
1246
1247 J. Guckenheimer, "Dimension Estimates for Attractors",
1248 Contemporary Mathematics, Vol. 28, 1984.
1249
1250 R. Clewley, J. Guckenheimer, F. Valero-Cuevas, "Estimating effective
1251 degrees of freedom in motor control", IEEE Transactions in Biomedical
1252 Engineering, 2007.
1253 """
1254 pde_name = 'PD_E-'+a.name
1255 if 'N' in a.keys():
1256 if 'stopix' in a.keys():
1257 if a.stopix is not None:
1258 a.stopix = int(round(0.7*a.N))
1259 else:
1260 a.stopix = int(round(0.7*a.N))
1261 a.num_samples = int(round(0.2*a.N))
1262 a.step = int(round(0.005*a.N))
1263 else:
1264 a.N = None
1265 if 'stopix' not in a.keys():
1266 a.stopix = None
1267 try:
1268 if force:
1269 raise ValueError
1270 data, pde_args, plotdata, ssorted, colors, \
1271 csorted, ixdata, sstats = loadObjects(pde_name)
1272 print "Loaded data set and stats for %s"%a.name
1273
1274 if filteredDict(a, ['data','bin_width'], neg=True) != \
1275 filteredDict(pde_args, ['data','bin_width'], neg=True):
1276 raise "Incompatible argument struct from file %s"%pde_name
1277 if a.data is None:
1278 a.data = data
1279 elif not all(a.data == data):
1280 raise "Incompatible data set loaded from file %s"%pde_name
1281 except ValueError:
1282 if a.data is None:
1283 try:
1284 data = eval(a.data_gen_str, globals(), a.data_gen_fun)
1285 except:
1286 print "Problem re-calculating data from given information"
1287 else:
1288 data = a.data
1289 print "Recalculating PD-E for %s"%a.name
1290 try:
1291 csource=a.color_source
1292 except AttributeError:
1293 csource=None
1294 if verbose:
1295 scatter_fig = None
1296 else:
1297 scatter_fig = 0
1298 if a.data is None:
1299 a.data = data
1300 plotdata, ssorted, colors, csorted, ixdata, sstats, handle = \
1301 scatterplot_slopes(data,
1302 startix=a.startix, stopix=a.stopix,
1303 num_samples=a.num_samples,
1304 Delta=a.Delta, color_source=csource, step=a.step,
1305 maxD=a.maxD, fignum=scatter_fig)
1306 saveObjects([data, a, plotdata, ssorted, colors, csorted, ixdata, sstats],
1307 pde_name, force=True)
1308 if verbose:
1309 scatter_histo(plotdata,bin_width=a.bin_width,select=0)
1310 scatter_histo(plotdata,bin_width=a.bin_width,select=1)
1311 info(sstats)
1312 if saveplot:
1313 save_info='result_'+a.name
1314 else:
1315 save_info=''
1316 if 'show_seq' in a.keys():
1317 show_seq = a['show_seq']
1318 else:
1319 show_seq = False
1320 more_info=prep_secant_figure(data, plotdata, ssorted, colors, ixdata, sstats,
1321 a.startix, a.stopix, a.Delta, secfig, save=save_info,
1322 show_seq=show_seq)
1323 try:
1324 more_info['handle']=handle
1325 except UnboundLocalError:
1326 more_info['handle']=None
1327 more_info['stats']=sstats
1328 more_info['args']=a
1329 return more_info
1330