 metric.py
1 """!
2
3 @brief Module provides various distance metrics - abstraction of the notion of distance in a metric space.
4
5 @authors Andrei Novikov (pyclustering@yandex.ru)
6 @date 2014-2020
8
10  PyClustering is free software: you can redistribute it and/or modify
12  the Free Software Foundation, either version 3 of the License, or
13  (at your option) any later version.
14
15  PyClustering is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  GNU General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with this program. If not, see <http://www.gnu.org/licenses/>.
22 @endcond
23
24 """
25
26
27 import numpy
28
29 from enum import IntEnum
30
31
32 class type_metric(IntEnum):
33  """!
34  @brief Enumeration of supported metrics in the module for distance calculation between two points.
35
36  """
37
38
39  EUCLIDEAN = 0
40
41
42  EUCLIDEAN_SQUARE = 1
43
44
45  MANHATTAN = 2
46
47
48  CHEBYSHEV = 3
49
50
51  MINKOWSKI = 4
52
53
54  CANBERRA = 5
55
56
57  CHI_SQUARE = 6
58
59
60  GOWER = 7
61
62
63  USER_DEFINED = 1000
64
65
66
68  """!
69  @brief Distance metric performs distance calculation between two points in line with encapsulated function, for
70  example, euclidean distance or chebyshev distance, or even user-defined.
71
72  @details
73
74  Example of Euclidean distance metric:
75  @code
76  metric = distance_metric(type_metric.EUCLIDEAN)
77  distance = metric([1.0, 2.5], [-1.2, 3.4])
78  @endcode
79
80  Example of Chebyshev distance metric:
81  @code
82  metric = distance_metric(type_metric.CHEBYSHEV)
83  distance = metric([0.0, 0.0], [2.5, 6.0])
84  @endcode
85
86  In following example additional argument should be specified (generally, 'degree' is a optional argument that is
87  equal to '2' by default) that is specific for Minkowski distance:
88  @code
89  metric = distance_metric(type_metric.MINKOWSKI, degree=4)
90  distance = metric([4.0, 9.2, 1.0], [3.4, 2.5, 6.2])
91  @endcode
92
93  User may define its own function for distance calculation. In this case input is two points, for example, you
94  want to implement your own version of Manhattan distance:
95  @code
96  from pyclustering.utils.metric import distance_metric, type_metric
97
98  def my_manhattan(point1, point2):
99  dimension = len(point1)
100  result = 0.0
101  for i in range(dimension):
102  result += abs(point1[i] - point2[i]) * 0.1
103  return result
104
105  metric = distance_metric(type_metric.USER_DEFINED, func=my_manhattan)
106  distance = metric([2.0, 3.0], [1.0, 3.0])
107  @endcode
108
109  """
110  def __init__(self, metric_type, **kwargs):
111  """!
112  @brief Creates distance metric instance for calculation distance between two points.
113
114  @param[in] metric_type (type_metric):
115  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'numpy_usage' 'func' and corresponding additional argument for
116  for specific metric types).
117
118  <b>Keyword Args:</b><br>
119  - func (callable): Callable object with two arguments (point #1 and point #2) or (object #1 and object #2) in case of numpy usage.
120  This argument is used only if metric is 'type_metric.USER_DEFINED'.
121  - degree (numeric): Only for 'type_metric.MINKOWSKI' - degree of Minkowski equation.
122  - max_range (array_like): Only for 'type_metric.GOWER' - max range in each dimension. 'data' can be used
124  - data (array_like): Only for 'type_metric.GOWER' - input data that used for 'max_range' calculation.
125  'max_range' can be used instead of this parameter.
126  - numpy_usage (bool): If True then numpy is used for calculation (by default is False).
127
128  """
129  self.__type = metric_type
130  self.__args = kwargs
131  self.__func = self.__args.get('func', None)
132  self.__numpy = self.__args.get('numpy_usage', False)
133
135
136
137  def __call__(self, point1, point2):
138  """!
139  @brief Calculates distance between two points.
140
141  @param[in] point1 (list): The first point.
142  @param[in] point2 (list): The second point.
143
144  @return (double) Distance between two points.
145
146  """
147  return self.__calculator(point1, point2)
148
149
150  def get_type(self):
151  """!
152  @brief Return type of distance metric that is used.
153
154  @return (type_metric) Type of distance metric.
155
156  """
157  return self.__type
158
159
160  def get_arguments(self):
161  """!
162  @brief Return additional arguments that are used by distance metric.
163
165
166  """
167  return self.__args
168
169
170  def get_function(self):
171  """!
172  @brief Return user-defined function for calculation distance metric.
173
174  @return (callable): User-defined distance metric function.
175
176  """
177  return self.__func
178
179
181  """!
182  @brief Start numpy for distance calculation.
183  @details Useful in case matrices to increase performance. No effect in case of type_metric.USER_DEFINED type.
184
185  """
186  self.__numpy = True
187  if self.__type != type_metric.USER_DEFINED:
189
190
192  """!
193  @brief Stop using numpy for distance calculation.
194  @details Useful in case of big amount of small data portion when numpy call is longer than calculation itself.
195  No effect in case of type_metric.USER_DEFINED type.
196
197  """
198  self.__numpy = False
200
201
202  def __create_distance_calculator(self):
203  """!
204  @brief Creates distance metric calculator.
205
206  @return (callable) Callable object of distance metric calculator.
207
208  """
209  if self.__numpy is True:
211
213
214
215  def __create_distance_calculator_basic(self):
216  """!
217  @brief Creates distance metric calculator that does not use numpy.
218
219  @return (callable) Callable object of distance metric calculator.
220
221  """
222  if self.__type == type_metric.EUCLIDEAN:
223  return euclidean_distance
224
225  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
226  return euclidean_distance_square
227
228  elif self.__type == type_metric.MANHATTAN:
229  return manhattan_distance
230
231  elif self.__type == type_metric.CHEBYSHEV:
232  return chebyshev_distance
233
234  elif self.__type == type_metric.MINKOWSKI:
235  return lambda point1, point2: minkowski_distance(point1, point2, self.__args.get('degree', 2))
236
237  elif self.__type == type_metric.CANBERRA:
238  return canberra_distance
239
240  elif self.__type == type_metric.CHI_SQUARE:
241  return chi_square_distance
242
243  elif self.__type == type_metric.GOWER:
244  max_range = self.__get_gower_max_range()
245  return lambda point1, point2: gower_distance(point1, point2, max_range)
246
247  elif self.__type == type_metric.USER_DEFINED:
248  return self.__func
249
250  else:
251  raise ValueError("Unknown type of metric: '%d'", self.__type)
252
253
254  def __get_gower_max_range(self):
255  """!
256  @brief Returns max range for Gower distance using input parameters ('max_range' or 'data').
257
258  @return (numpy.array) Max range for Gower distance.
259
260  """
261  max_range = self.__args.get('max_range', None)
262  if max_range is None:
263  data = self.__args.get('data', None)
264  if data is None:
265  raise ValueError("Gower distance requires 'data' or 'max_range' argument to construct metric.")
266
267  max_range = numpy.max(data, axis=0) - numpy.min(data, axis=0)
268  self.__args['max_range'] = max_range
269
270  return max_range
271
272
273  def __create_distance_calculator_numpy(self):
274  """!
275  @brief Creates distance metric calculator that uses numpy.
276
277  @return (callable) Callable object of distance metric calculator.
278
279  """
280  if self.__type == type_metric.EUCLIDEAN:
281  return euclidean_distance_numpy
282
283  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
284  return euclidean_distance_square_numpy
285
286  elif self.__type == type_metric.MANHATTAN:
287  return manhattan_distance_numpy
288
289  elif self.__type == type_metric.CHEBYSHEV:
290  return chebyshev_distance_numpy
291
292  elif self.__type == type_metric.MINKOWSKI:
293  return lambda object1, object2: minkowski_distance_numpy(object1, object2, self.__args.get('degree', 2))
294
295  elif self.__type == type_metric.CANBERRA:
296  return canberra_distance_numpy
297
298  elif self.__type == type_metric.CHI_SQUARE:
299  return chi_square_distance_numpy
300
301  elif self.__type == type_metric.GOWER:
302  max_range = self.__get_gower_max_range()
303  return lambda object1, object2: gower_distance_numpy(object1, object2, max_range)
304
305  elif self.__type == type_metric.USER_DEFINED:
306  return self.__func
307
308  else:
309  raise ValueError("Unknown type of metric: '%d'", self.__type)
310
311
312
313 def euclidean_distance(point1, point2):
314  """!
315  @brief Calculate Euclidean distance between two vectors.
316  @details The Euclidean between vectors (points) a and b is calculated by following formula:
317
318  \f[
319  dist(a, b) = \sqrt{ \sum_{i=0}^{N}(a_{i} - b_{i})^{2} };
320  \f]
321
322  Where N is a length of each vector.
323
324  @param[in] point1 (array_like): The first vector.
325  @param[in] point2 (array_like): The second vector.
326
327  @return (double) Euclidean distance between two vectors.
328
329  @see euclidean_distance_square, manhattan_distance, chebyshev_distance
330
331  """
332  distance = euclidean_distance_square(point1, point2)
333  return distance ** 0.5
334
335
336 def euclidean_distance_numpy(object1, object2):
337  """!
338  @brief Calculate Euclidean distance between two objects using numpy.
339
340  @param[in] object1 (array_like): The first array_like object.
341  @param[in] object2 (array_like): The second array_like object.
342
343  @return (double) Euclidean distance between two objects.
344
345  """
346  return numpy.sum(numpy.sqrt(numpy.square(object1 - object2)), axis=1).T
347
348
349 def euclidean_distance_square(point1, point2):
350  """!
351  @brief Calculate square Euclidean distance between two vectors.
352
353  \f[
354  dist(a, b) = \sum_{i=0}^{N}(a_{i} - b_{i})^{2};
355  \f]
356
357  @param[in] point1 (array_like): The first vector.
358  @param[in] point2 (array_like): The second vector.
359
360  @return (double) Square Euclidean distance between two vectors.
361
362  @see euclidean_distance, manhattan_distance, chebyshev_distance
363
364  """
365  distance = 0.0
366  for i in range(len(point1)):
367  distance += (point1[i] - point2[i]) ** 2.0
368
369  return distance
370
371
372 def euclidean_distance_square_numpy(object1, object2):
373  """!
374  @brief Calculate square Euclidean distance between two objects using numpy.
375
376  @param[in] object1 (array_like): The first array_like object.
377  @param[in] object2 (array_like): The second array_like object.
378
379  @return (double) Square Euclidean distance between two objects.
380
381  """
382  return numpy.sum(numpy.square(object1 - object2), axis=1).T
383
384
385 def manhattan_distance(point1, point2):
386  """!
387  @brief Calculate Manhattan distance between between two vectors.
388
389  \f[
390  dist(a, b) = \sum_{i=0}^{N}\left | a_{i} - b_{i} \right |;
391  \f]
392
393  @param[in] point1 (array_like): The first vector.
394  @param[in] point2 (array_like): The second vector.
395
396  @return (double) Manhattan distance between two vectors.
397
398  @see euclidean_distance_square, euclidean_distance, chebyshev_distance
399
400  """
401  distance = 0.0
402  dimension = len(point1)
403
404  for i in range(dimension):
405  distance += abs(point1[i] - point2[i])
406
407  return distance
408
409
410 def manhattan_distance_numpy(object1, object2):
411  """!
412  @brief Calculate Manhattan distance between two objects using numpy.
413
414  @param[in] object1 (array_like): The first array_like object.
415  @param[in] object2 (array_like): The second array_like object.
416
417  @return (double) Manhattan distance between two objects.
418
419  """
420  return numpy.sum(numpy.absolute(object1 - object2), axis=1).T
421
422
423 def chebyshev_distance(point1, point2):
424  """!
425  @brief Calculate Chebyshev distance (maximum metric) between between two vectors.
426  @details Chebyshev distance is a metric defined on a vector space where the distance between two vectors is the
427  greatest of their differences along any coordinate dimension.
428
429  \f[
430  dist(a, b) = \max_{}i\left (\left | a_{i} - b_{i} \right |\right );
431  \f]
432
433  @param[in] point1 (array_like): The first vector.
434  @param[in] point2 (array_like): The second vector.
435
436  @return (double) Chebyshev distance between two vectors.
437
438  @see euclidean_distance_square, euclidean_distance, minkowski_distance
439
440  """
441  distance = 0.0
442  dimension = len(point1)
443
444  for i in range(dimension):
445  distance = max(distance, abs(point1[i] - point2[i]))
446
447  return distance
448
449
450 def chebyshev_distance_numpy(object1, object2):
451  """!
452  @brief Calculate Chebyshev distance between two objects using numpy.
453
454  @param[in] object1 (array_like): The first array_like object.
455  @param[in] object2 (array_like): The second array_like object.
456
457  @return (double) Chebyshev distance between two objects.
458
459  """
460  return numpy.max(numpy.absolute(object1 - object2), axis=1).T
461
462
463 def minkowski_distance(point1, point2, degree=2):
464  """!
465  @brief Calculate Minkowski distance between two vectors.
466
467  \f[
468  dist(a, b) = \sqrt[p]{ \sum_{i=0}^{N}\left(a_{i} - b_{i}\right)^{p} };
469  \f]
470
471  @param[in] point1 (array_like): The first vector.
472  @param[in] point2 (array_like): The second vector.
473  @param[in] degree (numeric): Degree of that is used for Minkowski distance.
474
475  @return (double) Minkowski distance between two vectors.
476
477  @see euclidean_distance
478
479  """
480  distance = 0.0
481  for i in range(len(point1)):
482  distance += (point1[i] - point2[i]) ** degree
483
484  return distance ** (1.0 / degree)
485
486
487 def minkowski_distance_numpy(object1, object2, degree=2):
488  """!
489  @brief Calculate Minkowski distance between objects using numpy.
490
491  @param[in] object1 (array_like): The first array_like object.
492  @param[in] object2 (array_like): The second array_like object.
493  @param[in] degree (numeric): Degree of that is used for Minkowski distance.
494
495  @return (double) Minkowski distance between two object.
496
497  """
498  return numpy.sum(numpy.power(numpy.power(object1 - object2, degree), 1/degree), axis=1).T
499
500
501 def canberra_distance(point1, point2):
502  """!
503  @brief Calculate Canberra distance between two vectors.
504
505  \f[
506  dist(a, b) = \sum_{i=0}^{N}\frac{\left | a_{i} - b_{i} \right |}{\left | a_{i} \right | + \left | b_{i} \right |};
507  \f]
508
509  @param[in] point1 (array_like): The first vector.
510  @param[in] point2 (array_like): The second vector.
511
512  @return (float) Canberra distance between two objects.
513
514  """
515  distance = 0.0
516  for i in range(len(point1)):
517  divider = abs(point1[i]) + abs(point2[i])
518  if divider == 0.0:
519  continue
520
521  distance += abs(point1[i] - point2[i]) / divider
522
523  return distance
524
525
526 def canberra_distance_numpy(object1, object2):
527  """!
528  @brief Calculate Canberra distance between two objects using numpy.
529
530  @param[in] object1 (array_like): The first vector.
531  @param[in] object2 (array_like): The second vector.
532
533  @return (float) Canberra distance between two objects.
534
535  """
536  with numpy.errstate(divide='ignore', invalid='ignore'):
537  result = numpy.divide(numpy.abs(object1 - object2), numpy.abs(object1) + numpy.abs(object2))
538
539  if len(result.shape) > 1:
540  return numpy.sum(numpy.nan_to_num(result), axis=1).T
541  else:
542  return numpy.sum(numpy.nan_to_num(result))
543
544
545 def chi_square_distance(point1, point2):
546  """!
547  @brief Calculate Chi square distance between two vectors.
548
549  \f[
550  dist(a, b) = \sum_{i=0}^{N}\frac{\left ( a_{i} - b_{i} \right )^{2}}{\left | a_{i} \right | + \left | b_{i} \right |};
551  \f]
552
553  @param[in] point1 (array_like): The first vector.
554  @param[in] point2 (array_like): The second vector.
555
556  @return (float) Chi square distance between two objects.
557
558  """
559  distance = 0.0
560  for i in range(len(point1)):
561  divider = abs(point1[i]) + abs(point2[i])
562  if divider != 0.0:
563  distance += ((point1[i] - point2[i]) ** 2.0) / divider
564
565  return distance
566
567
568 def chi_square_distance_numpy(object1, object2):
569  """!
570  @brief Calculate Chi square distance between two vectors using numpy.
571
572  @param[in] object1 (array_like): The first vector.
573  @param[in] object2 (array_like): The second vector.
574
575  @return (float) Chi square distance between two objects.
576
577  """
578  with numpy.errstate(divide='ignore', invalid='ignore'):
579  result = numpy.divide(numpy.power(object1 - object2, 2), numpy.abs(object1) + numpy.abs(object2))
580
581  if len(result.shape) > 1:
582  return numpy.sum(numpy.nan_to_num(result), axis=1).T
583  else:
584  return numpy.sum(numpy.nan_to_num(result))
585
586
587 def gower_distance(point1, point2, max_range):
588  """!
589  @brief Calculate Gower distance between two vectors.
590  @details Implementation is based on the paper @cite article::utils::metric::gower. Gower distance is calculate
591  using following formula:
592  \f[
593  dist\left ( a, b \right )=\frac{1}{p}\sum_{i=0}^{p}\frac{\left | a_{i} - b_{i} \right |}{R_{i}},
594  \f]
595
596  where \f$R_{i}\f$ is a max range for ith dimension. \f$R\f$ is defined in line following formula:
597
598  \f[
599  R=max\left ( X \right )-min\left ( X \right )
600  \f]
601
602  @param[in] point1 (array_like): The first vector.
603  @param[in] point2 (array_like): The second vector.
604  @param[in] max_range (array_like): Max range in each data dimension.
605
606  @return (float) Gower distance between two objects.
607
608  """
609  distance = 0.0
610  dimensions = len(point1)
611  for i in range(dimensions):
612  if max_range[i] != 0.0:
613  distance += abs(point1[i] - point2[i]) / max_range[i]
614
615  return distance / dimensions
616
617
618 def gower_distance_numpy(point1, point2, max_range):
619  """!
620  @brief Calculate Gower distance between two vectors using numpy.
621
622  @param[in] point1 (array_like): The first vector.
623  @param[in] point2 (array_like): The second vector.
624  @param[in] max_range (array_like): Max range in each data dimension.
625
626  @return (float) Gower distance between two objects.
627
628  """
629  with numpy.errstate(divide='ignore', invalid='ignore'):
630  result = numpy.divide(numpy.abs(point1 - point2), max_range)
631
632  if len(result.shape) > 1:
633  return numpy.sum(numpy.nan_to_num(result), axis=1).T / len(point1)
634  else:
635  return numpy.sum(numpy.nan_to_num(result)) / len(point1)
def __create_distance_calculator_basic(self)
Creates distance metric calculator that does not use numpy.
Definition: metric.py:215
def chi_square_distance(point1, point2)
Calculate Chi square distance between two vectors.
Definition: metric.py:545
def get_arguments(self)
Return additional arguments that are used by distance metric.
Definition: metric.py:160
def euclidean_distance_square(point1, point2)
Calculate square Euclidean distance between two vectors.
Definition: metric.py:349
def minkowski_distance_numpy(object1, object2, degree=2)
Calculate Minkowski distance between objects using numpy.
Definition: metric.py:487
def chi_square_distance_numpy(object1, object2)
Calculate Chi square distance between two vectors using numpy.
Definition: metric.py:568
def __create_distance_calculator(self)
Creates distance metric calculator.
Definition: metric.py:202
def get_type(self)
Return type of distance metric that is used.
Definition: metric.py:150
def chebyshev_distance_numpy(object1, object2)
Calculate Chebyshev distance between two objects using numpy.
Definition: metric.py:450
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
def manhattan_distance_numpy(object1, object2)
Calculate Manhattan distance between two objects using numpy.
Definition: metric.py:410
def __init__(self, metric_type, kwargs)
Creates distance metric instance for calculation distance between two points.
Definition: metric.py:110
def gower_distance(point1, point2, max_range)
Calculate Gower distance between two vectors.
Definition: metric.py:587
def get_function(self)
Return user-defined function for calculation distance metric.
Definition: metric.py:170
def gower_distance_numpy(point1, point2, max_range)
Calculate Gower distance between two vectors using numpy.
Definition: metric.py:618
def disable_numpy_usage(self)
Stop using numpy for distance calculation.
Definition: metric.py:191
def canberra_distance(point1, point2)
Calculate Canberra distance between two vectors.
Definition: metric.py:501
def canberra_distance_numpy(object1, object2)
Calculate Canberra distance between two objects using numpy.
Definition: metric.py:526
def euclidean_distance_square_numpy(object1, object2)
Calculate square Euclidean distance between two objects using numpy.
Definition: metric.py:372
def __call__(self, point1, point2)
Calculates distance between two points.
Definition: metric.py:137
def __create_distance_calculator_numpy(self)
Creates distance metric calculator that uses numpy.
Definition: metric.py:273
def euclidean_distance(point1, point2)
Calculate Euclidean distance between two vectors.
Definition: metric.py:313
def manhattan_distance(point1, point2)
Calculate Manhattan distance between between two vectors.
Definition: metric.py:385
def minkowski_distance(point1, point2, degree=2)
Calculate Minkowski distance between two vectors.
Definition: metric.py:463
def euclidean_distance_numpy(object1, object2)
Calculate Euclidean distance between two objects using numpy.
Definition: metric.py:336
def enable_numpy_usage(self)
Start numpy for distance calculation.
Definition: metric.py:180
def __get_gower_max_range(self)
Returns max range for Gower distance using input parameters (&#39;max_range&#39; or &#39;data&#39;).
Definition: metric.py:254
Enumeration of supported metrics in the module for distance calculation between two points...
Definition: metric.py:32
def chebyshev_distance(point1, point2)
Calculate Chebyshev distance (maximum metric) between between two vectors.
Definition: metric.py:423