 pyclustering  0.10.1 pyclustering is a Python, C++ data mining library.
metric.py
1 """!
2
3 @brief Module provides various distance metrics - abstraction of the notion of distance in a metric space.
4
5 @authors Andrei Novikov (pyclustering@yandex.ru)
6 @date 2014-2020
8
9 """
10
11
12 import numpy
13
14 from enum import IntEnum
15
16
class type_metric(IntEnum):
    """!
    @brief Identifiers of the distance metrics that the module can calculate between two points.

    """

    # Euclidean distance, see function 'euclidean_distance'.
    EUCLIDEAN = 0

    # Square Euclidean distance, see function 'euclidean_distance_square'.
    EUCLIDEAN_SQUARE = 1

    # Manhattan distance, see function 'manhattan_distance'.
    MANHATTAN = 2

    # Chebyshev distance, see function 'chebyshev_distance'.
    CHEBYSHEV = 3

    # Minkowski distance, see function 'minkowski_distance'.
    MINKOWSKI = 4

    # Canberra distance, see function 'canberra_distance'.
    CANBERRA = 5

    # Chi square distance, see function 'chi_square_distance'.
    CHI_SQUARE = 6

    # Gower distance, see function 'gower_distance'.
    GOWER = 7

    # Metric whose calculation is delegated to a callable supplied by the user.
    USER_DEFINED = 1000
49
50
51
class distance_metric:
    """!
    @brief Distance metric performs distance calculation between two points in line with encapsulated function, for
            example, euclidean distance or chebyshev distance, or even user-defined.

    @details

    Example of Euclidean distance metric:
    @code
        metric = distance_metric(type_metric.EUCLIDEAN)
        distance = metric([1.0, 2.5], [-1.2, 3.4])
    @endcode

    Example of Chebyshev distance metric:
    @code
        metric = distance_metric(type_metric.CHEBYSHEV)
        distance = metric([0.0, 0.0], [2.5, 6.0])
    @endcode

    In the following example an additional argument is specified (generally, 'degree' is an optional argument that
    is equal to '2' by default) that is specific to Minkowski distance:
    @code
        metric = distance_metric(type_metric.MINKOWSKI, degree=4)
        distance = metric([4.0, 9.2, 1.0], [3.4, 2.5, 6.2])
    @endcode

    A user may define their own function for distance calculation. In this case the input is two points, for
    example, you want to implement your own version of Manhattan distance:
    @code
        from pyclustering.utils.metric import distance_metric, type_metric

        def my_manhattan(point1, point2):
            dimension = len(point1)
            result = 0.0
            for i in range(dimension):
                result += abs(point1[i] - point2[i]) * 0.1
            return result

        metric = distance_metric(type_metric.USER_DEFINED, func=my_manhattan)
        distance = metric([2.0, 3.0], [1.0, 3.0])
    @endcode

    """
95  def __init__(self, metric_type, **kwargs):
96  """!
97  @brief Creates distance metric instance for calculation distance between two points.
98
99  @param[in] metric_type (type_metric):
100  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'numpy_usage' 'func' and corresponding additional argument for
101  for specific metric types).
102
103  <b>Keyword Args:</b><br>
104  - func (callable): Callable object with two arguments (point #1 and point #2) or (object #1 and object #2) in case of numpy usage.
105  This argument is used only if metric is 'type_metric.USER_DEFINED'.
106  - degree (numeric): Only for 'type_metric.MINKOWSKI' - degree of Minkowski equation.
107  - max_range (array_like): Only for 'type_metric.GOWER' - max range in each dimension. 'data' can be used
109  - data (array_like): Only for 'type_metric.GOWER' - input data that used for 'max_range' calculation.
110  'max_range' can be used instead of this parameter.
111  - numpy_usage (bool): If True then numpy is used for calculation (by default is False).
112
113  """
114  self.__type = metric_type
115  self.__args = kwargs
116  self.__func = self.__args.get('func', None)
117  self.__numpy = self.__args.get('numpy_usage', False)
118
120
121
122  def __call__(self, point1, point2):
123  """!
124  @brief Calculates distance between two points.
125
126  @param[in] point1 (list): The first point.
127  @param[in] point2 (list): The second point.
128
129  @return (double) Distance between two points.
130
131  """
132  return self.__calculator(point1, point2)
133
134
135  def get_type(self):
136  """!
137  @brief Return type of distance metric that is used.
138
139  @return (type_metric) Type of distance metric.
140
141  """
142  return self.__type
143
144
145  def get_arguments(self):
146  """!
147  @brief Return additional arguments that are used by distance metric.
148
150
151  """
152  return self.__args
153
154
155  def get_function(self):
156  """!
157  @brief Return user-defined function for calculation distance metric.
158
159  @return (callable): User-defined distance metric function.
160
161  """
162  return self.__func
163
164
166  """!
167  @brief Start numpy for distance calculation.
168  @details Useful in case matrices to increase performance. No effect in case of type_metric.USER_DEFINED type.
169
170  """
171  self.__numpy = True
172  if self.__type != type_metric.USER_DEFINED:
174
175
177  """!
178  @brief Stop using numpy for distance calculation.
179  @details Useful in case of big amount of small data portion when numpy call is longer than calculation itself.
180  No effect in case of type_metric.USER_DEFINED type.
181
182  """
183  self.__numpy = False
185
186
187  def __create_distance_calculator(self):
188  """!
189  @brief Creates distance metric calculator.
190
191  @return (callable) Callable object of distance metric calculator.
192
193  """
194  if self.__numpy is True:
196
198
199
200  def __create_distance_calculator_basic(self):
201  """!
202  @brief Creates distance metric calculator that does not use numpy.
203
204  @return (callable) Callable object of distance metric calculator.
205
206  """
207  if self.__type == type_metric.EUCLIDEAN:
208  return euclidean_distance
209
210  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
211  return euclidean_distance_square
212
213  elif self.__type == type_metric.MANHATTAN:
214  return manhattan_distance
215
216  elif self.__type == type_metric.CHEBYSHEV:
217  return chebyshev_distance
218
219  elif self.__type == type_metric.MINKOWSKI:
220  return lambda point1, point2: minkowski_distance(point1, point2, self.__args.get('degree', 2))
221
222  elif self.__type == type_metric.CANBERRA:
223  return canberra_distance
224
225  elif self.__type == type_metric.CHI_SQUARE:
226  return chi_square_distance
227
228  elif self.__type == type_metric.GOWER:
229  max_range = self.__get_gower_max_range()
230  return lambda point1, point2: gower_distance(point1, point2, max_range)
231
232  elif self.__type == type_metric.USER_DEFINED:
233  return self.__func
234
235  else:
236  raise ValueError("Unknown type of metric: '%d'", self.__type)
237
238
239  def __get_gower_max_range(self):
240  """!
241  @brief Returns max range for Gower distance using input parameters ('max_range' or 'data').
242
243  @return (numpy.array) Max range for Gower distance.
244
245  """
246  max_range = self.__args.get('max_range', None)
247  if max_range is None:
248  data = self.__args.get('data', None)
249  if data is None:
250  raise ValueError("Gower distance requires 'data' or 'max_range' argument to construct metric.")
251
252  max_range = numpy.max(data, axis=0) - numpy.min(data, axis=0)
253  self.__args['max_range'] = max_range
254
255  return max_range
256
257
258  def __create_distance_calculator_numpy(self):
259  """!
260  @brief Creates distance metric calculator that uses numpy.
261
262  @return (callable) Callable object of distance metric calculator.
263
264  """
265  if self.__type == type_metric.EUCLIDEAN:
266  return euclidean_distance_numpy
267
268  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
269  return euclidean_distance_square_numpy
270
271  elif self.__type == type_metric.MANHATTAN:
272  return manhattan_distance_numpy
273
274  elif self.__type == type_metric.CHEBYSHEV:
275  return chebyshev_distance_numpy
276
277  elif self.__type == type_metric.MINKOWSKI:
278  return lambda object1, object2: minkowski_distance_numpy(object1, object2, self.__args.get('degree', 2))
279
280  elif self.__type == type_metric.CANBERRA:
281  return canberra_distance_numpy
282
283  elif self.__type == type_metric.CHI_SQUARE:
284  return chi_square_distance_numpy
285
286  elif self.__type == type_metric.GOWER:
287  max_range = self.__get_gower_max_range()
288  return lambda object1, object2: gower_distance_numpy(object1, object2, max_range)
289
290  elif self.__type == type_metric.USER_DEFINED:
291  return self.__func
292
293  else:
294  raise ValueError("Unknown type of metric: '%d'", self.__type)
295
296
297
def euclidean_distance(point1, point2):
    """!
    @brief Calculate Euclidean distance between two vectors.
    @details The distance is obtained as a square root of the square Euclidean distance:

    \f[
    dist(a, b) = \sqrt{ \sum_{i=0}^{N}(a_{i} - b_{i})^{2} };
    \f]

    Where N is a length of each vector.

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (double) Euclidean distance between two vectors.

    @see euclidean_distance_square, manhattan_distance, chebyshev_distance

    """
    return euclidean_distance_square(point1, point2) ** 0.5
319
320
def euclidean_distance_numpy(object1, object2):
    """!
    @brief Calculate Euclidean distance between two objects using numpy.
    @details When any of the objects has more than one dimension, the sum is taken along axis 1 so that one
              distance per row is produced; otherwise a single value is produced.

    @param[in] object1 (array_like): The first array_like object.
    @param[in] object2 (array_like): The second array_like object.

    @return (double) Euclidean distance between two objects.

    """
    if len(object1.shape) <= 1 and len(object2.shape) <= 1:
        squared_total = numpy.sum(numpy.square(object1 - object2))
        return numpy.sqrt(squared_total)

    squared_per_row = numpy.sum(numpy.square(object1 - object2), axis=1)
    return numpy.sqrt(squared_per_row)
335
336
def euclidean_distance_square(point1, point2):
    """!
    @brief Calculate square Euclidean distance between two vectors.

    \f[
    dist(a, b) = \sum_{i=0}^{N}(a_{i} - b_{i})^{2};
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (double) Square Euclidean distance between two vectors.

    @see euclidean_distance, manhattan_distance, chebyshev_distance

    """
    total = 0.0
    for index, coordinate in enumerate(point1):
        difference = coordinate - point2[index]
        total += difference ** 2.0

    return total
358
359
def euclidean_distance_square_numpy(object1, object2):
    """!
    @brief Calculate square Euclidean distance between two objects using numpy.
    @details When any of the objects has more than one dimension, the sum is taken along axis 1 and the result
              is transposed; otherwise a single value is produced.

    @param[in] object1 (array_like): The first array_like object.
    @param[in] object2 (array_like): The second array_like object.

    @return (double) Square Euclidean distance between two objects.

    """
    if len(object1.shape) <= 1 and len(object2.shape) <= 1:
        return numpy.sum(numpy.square(object1 - object2))

    squared_per_row = numpy.sum(numpy.square(object1 - object2), axis=1)
    return squared_per_row.T
374
375
def manhattan_distance(point1, point2):
    """!
    @brief Calculate Manhattan distance between two vectors.

    \f[
    dist(a, b) = \sum_{i=0}^{N}\left | a_{i} - b_{i} \right |;
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (double) Manhattan distance between two vectors.

    @see euclidean_distance_square, euclidean_distance, chebyshev_distance

    """
    total = 0.0
    for index, coordinate in enumerate(point1):
        total += abs(coordinate - point2[index])

    return total
399
400
def manhattan_distance_numpy(object1, object2):
    """!
    @brief Calculate Manhattan distance between two objects using numpy.
    @details When any of the objects has more than one dimension, the sum is taken along axis 1 and the result
              is transposed; otherwise a single value is produced.

    @param[in] object1 (array_like): The first array_like object.
    @param[in] object2 (array_like): The second array_like object.

    @return (double) Manhattan distance between two objects.

    """
    if len(object1.shape) <= 1 and len(object2.shape) <= 1:
        return numpy.sum(numpy.absolute(object1 - object2))

    absolute_per_row = numpy.sum(numpy.absolute(object1 - object2), axis=1)
    return absolute_per_row.T
415
416
def chebyshev_distance(point1, point2):
    """!
    @brief Calculate Chebyshev distance (maximum metric) between two vectors.
    @details Chebyshev distance is a metric defined on a vector space where the distance between two vectors is the
              greatest of their differences along any coordinate dimension.

    \f[
    dist(a, b) = \max_{i}\left (\left | a_{i} - b_{i} \right |\right );
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (double) Chebyshev distance between two vectors.

    @see euclidean_distance_square, euclidean_distance, minkowski_distance

    """
    largest = 0.0
    for index, coordinate in enumerate(point1):
        gap = abs(coordinate - point2[index])
        # Strict comparison keeps the current value on ties.
        if gap > largest:
            largest = gap

    return largest
442
443
def chebyshev_distance_numpy(object1, object2):
    """!
    @brief Calculate Chebyshev distance between two objects using numpy.
    @details When any of the objects has more than one dimension, the maximum is taken along axis 1 and the
              result is transposed; otherwise a single value is produced.

    @param[in] object1 (array_like): The first array_like object.
    @param[in] object2 (array_like): The second array_like object.

    @return (double) Chebyshev distance between two objects.

    """
    if len(object1.shape) <= 1 and len(object2.shape) <= 1:
        return numpy.max(numpy.absolute(object1 - object2))

    largest_per_row = numpy.max(numpy.absolute(object1 - object2), axis=1)
    return largest_per_row.T
458
459
def minkowski_distance(point1, point2, degree=2):
    """!
    @brief Calculate Minkowski distance between two vectors.

    \f[
    dist(a, b) = \sqrt[p]{ \sum_{i=0}^{N}\left | a_{i} - b_{i} \right |^{p} };
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.
    @param[in] degree (numeric): Degree that is used for Minkowski distance.

    @return (double) Minkowski distance between two vectors.

    @see euclidean_distance

    """
    distance = 0.0
    for i in range(len(point1)):
        # Absolute difference is required: without it odd degrees let negative terms cancel positive ones
        # and the root of a negative sum is not a real number.
        distance += abs(point1[i] - point2[i]) ** degree

    return distance ** (1.0 / degree)
482
483
def minkowski_distance_numpy(object1, object2, degree=2):
    """!
    @brief Calculate Minkowski distance between objects using numpy.
    @details Absolute differences are raised to the specified degree. When any of the objects has more than one
              dimension, the sum is taken along axis 1; otherwise a single value is produced.

    @param[in] object1 (array_like): The first array_like object.
    @param[in] object2 (array_like): The second array_like object.
    @param[in] degree (numeric): Degree that is used for Minkowski distance.

    @return (double) Minkowski distance between two objects.

    """
    if len(object1.shape) > 1 or len(object2.shape) > 1:
        # Absolute difference is required: signed differences cancel each other for odd degrees and
        # a negative sum raised to a fractional power produces NaN.
        return numpy.power(numpy.sum(numpy.power(numpy.abs(object1 - object2), degree), axis=1), 1.0 / degree)
    else:
        return numpy.power(numpy.sum(numpy.power(numpy.abs(object1 - object2), degree)), 1.0 / degree)
499
500
def canberra_distance(point1, point2):
    """!
    @brief Calculate Canberra distance between two vectors.
    @details Dimensions where both coordinates are zero (zero denominator) do not contribute to the distance.

    \f[
    dist(a, b) = \sum_{i=0}^{N}\frac{\left | a_{i} - b_{i} \right |}{\left | a_{i} \right | + \left | b_{i} \right |};
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (float) Canberra distance between two objects.

    """
    total = 0.0
    for index, left in enumerate(point1):
        right = point2[index]
        denominator = abs(left) + abs(right)
        if denominator != 0.0:
            total += abs(left - right) / denominator

    return total
524
525
def canberra_distance_numpy(object1, object2):
    """!
    @brief Calculate Canberra distance between two objects using numpy.
    @details Division warnings are suppressed and NaN values that appear for zero denominators are replaced
              by numpy.nan_to_num before summation. When the result of the division has more than one dimension,
              the sum is taken along axis 1 and transposed.

    @param[in] object1 (array_like): The first vector.
    @param[in] object2 (array_like): The second vector.

    @return (float) Canberra distance between two objects.

    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        numerator = numpy.abs(object1 - object2)
        denominator = numpy.abs(object1) + numpy.abs(object2)
        ratio = numpy.divide(numerator, denominator)

    if len(ratio.shape) <= 1:
        return numpy.sum(numpy.nan_to_num(ratio))

    return numpy.sum(numpy.nan_to_num(ratio), axis=1).T
543
544
def chi_square_distance(point1, point2):
    """!
    @brief Calculate Chi square distance between two vectors.
    @details Dimensions where both coordinates are zero (zero denominator) do not contribute to the distance.

    \f[
    dist(a, b) = \sum_{i=0}^{N}\frac{\left ( a_{i} - b_{i} \right )^{2}}{\left | a_{i} \right | + \left | b_{i} \right |};
    \f]

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.

    @return (float) Chi square distance between two objects.

    """
    total = 0.0
    for index, left in enumerate(point1):
        right = point2[index]
        denominator = abs(left) + abs(right)
        if denominator == 0.0:
            continue

        total += ((left - right) ** 2.0) / denominator

    return total
566
567
def chi_square_distance_numpy(object1, object2):
    """!
    @brief Calculate Chi square distance between two vectors using numpy.
    @details Division warnings are suppressed and NaN values that appear for zero denominators are replaced
              by numpy.nan_to_num before summation. When the result of the division has more than one dimension,
              the sum is taken along axis 1 and transposed.

    @param[in] object1 (array_like): The first vector.
    @param[in] object2 (array_like): The second vector.

    @return (float) Chi square distance between two objects.

    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        numerator = numpy.power(object1 - object2, 2)
        denominator = numpy.abs(object1) + numpy.abs(object2)
        ratio = numpy.divide(numerator, denominator)

    if len(ratio.shape) <= 1:
        return numpy.sum(numpy.nan_to_num(ratio))

    return numpy.sum(numpy.nan_to_num(ratio), axis=1).T
585
586
def gower_distance(point1, point2, max_range):
    """!
    @brief Calculate Gower distance between two vectors.
    @details Implementation is based on the paper @cite article::utils::metric::gower. Gower distance is calculated
              using the following formula:
    \f[
    dist\left ( a, b \right )=\frac{1}{p}\sum_{i=0}^{p}\frac{\left | a_{i} - b_{i} \right |}{R_{i}},
    \f]

    where \f$R_{i}\f$ is a max range for ith dimension. \f$R\f$ is defined in line with the following formula:

    \f[
    R=max\left ( X \right )-min\left ( X \right )
    \f]

    Dimensions whose max range is zero do not contribute to the sum, but they are still counted in the divisor.

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.
    @param[in] max_range (array_like): Max range in each data dimension.

    @return (float) Gower distance between two objects.

    """
    dimensions = len(point1)
    accumulated = 0.0
    for index in range(dimensions):
        if max_range[index] == 0.0:
            continue

        accumulated += abs(point1[index] - point2[index]) / max_range[index]

    return accumulated / dimensions
616
617
def gower_distance_numpy(point1, point2, max_range):
    """!
    @brief Calculate Gower distance between two vectors using numpy.
    @details Absolute differences are divided by the max range of each dimension, NaN values produced by the
              division are replaced by numpy.nan_to_num, and the sum is divided by the amount of dimensions.
              When the result of the division has more than one dimension, one distance per row is produced.

    @param[in] point1 (array_like): The first vector.
    @param[in] point2 (array_like): The second vector.
    @param[in] max_range (array_like): Max range in each data dimension.

    @return (float) Gower distance between two objects.

    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        result = numpy.divide(numpy.abs(point1 - point2), max_range)

    if len(result.shape) > 1:
        # Rows are points and columns are dimensions, so the divisor is the amount of columns
        # ('len(result)' would be the amount of rows and would scale distances by the batch size).
        return numpy.sum(numpy.nan_to_num(result), axis=1).T / result.shape[1]
    else:
        return numpy.sum(numpy.nan_to_num(result)) / len(point1)
pyclustering.utils.metric.euclidean_distance_square_numpy
def euclidean_distance_square_numpy(object1, object2)
Calculate square Euclidean distance between two objects using numpy.
Definition: metric.py:360
pyclustering.utils.metric.gower_distance_numpy
def gower_distance_numpy(point1, point2, max_range)
Calculate Gower distance between two vectors using numpy.
Definition: metric.py:618
pyclustering.utils.metric.distance_metric.__func
__func
Definition: metric.py:116
pyclustering.utils.metric.distance_metric.enable_numpy_usage
def enable_numpy_usage(self)
Start numpy for distance calculation.
Definition: metric.py:165
pyclustering.utils.metric.euclidean_distance_numpy
def euclidean_distance_numpy(object1, object2)
Calculate Euclidean distance between two objects using numpy.
Definition: metric.py:321
pyclustering.utils.metric.distance_metric.__create_distance_calculator
def __create_distance_calculator(self)
Creates distance metric calculator.
Definition: metric.py:187
pyclustering.utils.metric.chebyshev_distance
def chebyshev_distance(point1, point2)
Calculate Chebyshev distance (maximum metric) between two vectors.
Definition: metric.py:417
pyclustering.utils.metric.euclidean_distance
def euclidean_distance(point1, point2)
Calculate Euclidean distance between two vectors.
Definition: metric.py:298
pyclustering.utils.metric.minkowski_distance_numpy
def minkowski_distance_numpy(object1, object2, degree=2)
Calculate Minkowski distance between objects using numpy.
Definition: metric.py:484
pyclustering.utils.metric.distance_metric.__create_distance_calculator_numpy
def __create_distance_calculator_numpy(self)
Creates distance metric calculator that uses numpy.
Definition: metric.py:258
pyclustering.utils.metric.canberra_distance_numpy
def canberra_distance_numpy(object1, object2)
Calculate Canberra distance between two objects using numpy.
Definition: metric.py:526
pyclustering.utils.metric.distance_metric
Distance metric performs distance calculation between two points in line with encapsulated function,...
Definition: metric.py:52
pyclustering.utils.metric.manhattan_distance
def manhattan_distance(point1, point2)
Calculate Manhattan distance between two vectors.
Definition: metric.py:376
pyclustering.utils.metric.minkowski_distance
def minkowski_distance(point1, point2, degree=2)
Calculate Minkowski distance between two vectors.
Definition: metric.py:460
pyclustering.utils.metric.distance_metric.get_function
def get_function(self)
Return user-defined function for calculation distance metric.
Definition: metric.py:155
pyclustering.utils.metric.distance_metric.__call__
def __call__(self, point1, point2)
Calculates distance between two points.
Definition: metric.py:122
pyclustering.utils.metric.distance_metric.__type
__type
Definition: metric.py:114
pyclustering.utils.metric.distance_metric.get_type
def get_type(self)
Return type of distance metric that is used.
Definition: metric.py:135
pyclustering.utils.metric.distance_metric.__calculator
__calculator
Definition: metric.py:119
pyclustering.utils.metric.chi_square_distance
def chi_square_distance(point1, point2)
Calculate Chi square distance between two vectors.
Definition: metric.py:545
pyclustering.utils.metric.distance_metric.__init__
def __init__(self, metric_type, **kwargs)
Creates distance metric instance for calculation distance between two points.
Definition: metric.py:95
pyclustering.utils.metric.manhattan_distance_numpy
def manhattan_distance_numpy(object1, object2)
Calculate Manhattan distance between two objects using numpy.
Definition: metric.py:401
pyclustering.utils.metric.canberra_distance
def canberra_distance(point1, point2)
Calculate Canberra distance between two vectors.
Definition: metric.py:501
pyclustering.utils.metric.gower_distance
def gower_distance(point1, point2, max_range)
Calculate Gower distance between two vectors.
Definition: metric.py:587
pyclustering.utils.metric.distance_metric.__numpy
__numpy
Definition: metric.py:117
pyclustering.utils.metric.distance_metric.__args
__args
Definition: metric.py:115
pyclustering.utils.metric.distance_metric.__get_gower_max_range
def __get_gower_max_range(self)
Returns max range for Gower distance using input parameters ('max_range' or 'data').
Definition: metric.py:239
pyclustering.utils.metric.type_metric
Enumeration of supported metrics in the module for distance calculation between two points.
Definition: metric.py:17
pyclustering.utils.metric.distance_metric.disable_numpy_usage
def disable_numpy_usage(self)
Stop using numpy for distance calculation.
Definition: metric.py:176
pyclustering.utils.metric.distance_metric.__create_distance_calculator_basic
def __create_distance_calculator_basic(self)
Creates distance metric calculator that does not use numpy.
Definition: metric.py:200
pyclustering.utils.metric.chebyshev_distance_numpy
def chebyshev_distance_numpy(object1, object2)
Calculate Chebyshev distance between two objects using numpy.
Definition: metric.py:444
pyclustering.utils.metric.distance_metric.get_arguments
def get_arguments(self)
Return additional arguments that are used by distance metric.
Definition: metric.py:145
pyclustering.utils.metric.euclidean_distance_square
def euclidean_distance_square(point1, point2)
Calculate square Euclidean distance between two vectors.
Definition: metric.py:337
pyclustering.utils.metric.chi_square_distance_numpy
def chi_square_distance_numpy(object1, object2)
Calculate Chi square distance between two vectors using numpy.
Definition: metric.py:568