metric.py
1 """!
2 
3 @brief Module provides various distance metrics - abstraction of the notion of distance in a metric space.
4 
5 @authors Andrei Novikov (pyclustering@yandex.ru)
6 @date 2014-2020
7 @copyright GNU Public License
8 
9 @cond GNU_PUBLIC_LICENSE
10  PyClustering is free software: you can redistribute it and/or modify
11  it under the terms of the GNU General Public License as published by
12  the Free Software Foundation, either version 3 of the License, or
13  (at your option) any later version.
14 
15  PyClustering is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  GNU General Public License for more details.
19 
20  You should have received a copy of the GNU General Public License
21  along with this program. If not, see <http://www.gnu.org/licenses/>.
22 @endcond
23 
24 """
25 
26 
27 import numpy
28 
29 from enum import IntEnum
30 
31 
32 class type_metric(IntEnum):
33  """!
34  @brief Enumeration of supported metrics in the module for distance calculation between two points.
35 
36  """
37 
38 
39  EUCLIDEAN = 0
40 
41 
42  EUCLIDEAN_SQUARE = 1
43 
44 
45  MANHATTAN = 2
46 
47 
48  CHEBYSHEV = 3
49 
50 
51  MINKOWSKI = 4
52 
53 
54  CANBERRA = 5
55 
56 
57  CHI_SQUARE = 6
58 
59 
60  GOWER = 7
61 
62 
63  USER_DEFINED = 1000
64 
65 
66 
68  """!
69  @brief Distance metric performs distance calculation between two points in line with encapsulated function, for
70  example, euclidean distance or chebyshev distance, or even user-defined.
71 
72  @details
73 
74  Example of Euclidean distance metric:
75  @code
76  metric = distance_metric(type_metric.EUCLIDEAN)
77  distance = metric([1.0, 2.5], [-1.2, 3.4])
78  @endcode
79 
80  Example of Chebyshev distance metric:
81  @code
82  metric = distance_metric(type_metric.CHEBYSHEV)
83  distance = metric([0.0, 0.0], [2.5, 6.0])
84  @endcode
85 
86  In following example additional argument should be specified (generally, 'degree' is a optional argument that is
87  equal to '2' by default) that is specific for Minkowski distance:
88  @code
89  metric = distance_metric(type_metric.MINKOWSKI, degree=4)
90  distance = metric([4.0, 9.2, 1.0], [3.4, 2.5, 6.2])
91  @endcode
92 
93  User may define its own function for distance calculation. In this case input is two points, for example, you
94  want to implement your own version of Manhattan distance:
95  @code
96  from pyclustering.utils.metric import distance_metric, type_metric
97 
98  def my_manhattan(point1, point2):
99  dimension = len(point1)
100  result = 0.0
101  for i in range(dimension):
102  result += abs(point1[i] - point2[i]) * 0.1
103  return result
104 
105  metric = distance_metric(type_metric.USER_DEFINED, func=my_manhattan)
106  distance = metric([2.0, 3.0], [1.0, 3.0])
107  @endcode
108 
109  """
110  def __init__(self, metric_type, **kwargs):
111  """!
112  @brief Creates distance metric instance for calculation distance between two points.
113 
114  @param[in] metric_type (type_metric):
115  @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'numpy_usage' 'func' and corresponding additional argument for
116  for specific metric types).
117 
118  <b>Keyword Args:</b><br>
119  - func (callable): Callable object with two arguments (point #1 and point #2) or (object #1 and object #2) in case of numpy usage.
120  This argument is used only if metric is 'type_metric.USER_DEFINED'.
121  - degree (numeric): Only for 'type_metric.MINKOWSKI' - degree of Minkowski equation.
122  - max_range (array_like): Only for 'type_metric.GOWER' - max range in each dimension. 'data' can be used
123  instead of this parameter.
124  - data (array_like): Only for 'type_metric.GOWER' - input data that used for 'max_range' calculation.
125  'max_range' can be used instead of this parameter.
126  - numpy_usage (bool): If True then numpy is used for calculation (by default is False).
127 
128  """
129  self.__type = metric_type
130  self.__args = kwargs
131  self.__func = self.__args.get('func', None)
132  self.__numpy = self.__args.get('numpy_usage', False)
133 
135 
136 
137  def __call__(self, point1, point2):
138  """!
139  @brief Calculates distance between two points.
140 
141  @param[in] point1 (list): The first point.
142  @param[in] point2 (list): The second point.
143 
144  @return (double) Distance between two points.
145 
146  """
147  return self.__calculator(point1, point2)
148 
149 
150  def get_type(self):
151  """!
152  @brief Return type of distance metric that is used.
153 
154  @return (type_metric) Type of distance metric.
155 
156  """
157  return self.__type
158 
159 
160  def get_arguments(self):
161  """!
162  @brief Return additional arguments that are used by distance metric.
163 
164  @return (dict) Additional arguments.
165 
166  """
167  return self.__args
168 
169 
170  def get_function(self):
171  """!
172  @brief Return user-defined function for calculation distance metric.
173 
174  @return (callable): User-defined distance metric function.
175 
176  """
177  return self.__func
178 
179 
181  """!
182  @brief Start numpy for distance calculation.
183  @details Useful in case matrices to increase performance. No effect in case of type_metric.USER_DEFINED type.
184 
185  """
186  self.__numpy = True
187  if self.__type != type_metric.USER_DEFINED:
189 
190 
192  """!
193  @brief Stop using numpy for distance calculation.
194  @details Useful in case of big amount of small data portion when numpy call is longer than calculation itself.
195  No effect in case of type_metric.USER_DEFINED type.
196 
197  """
198  self.__numpy = False
200 
201 
202  def __create_distance_calculator(self):
203  """!
204  @brief Creates distance metric calculator.
205 
206  @return (callable) Callable object of distance metric calculator.
207 
208  """
209  if self.__numpy is True:
211 
213 
214 
215  def __create_distance_calculator_basic(self):
216  """!
217  @brief Creates distance metric calculator that does not use numpy.
218 
219  @return (callable) Callable object of distance metric calculator.
220 
221  """
222  if self.__type == type_metric.EUCLIDEAN:
223  return euclidean_distance
224 
225  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
226  return euclidean_distance_square
227 
228  elif self.__type == type_metric.MANHATTAN:
229  return manhattan_distance
230 
231  elif self.__type == type_metric.CHEBYSHEV:
232  return chebyshev_distance
233 
234  elif self.__type == type_metric.MINKOWSKI:
235  return lambda point1, point2: minkowski_distance(point1, point2, self.__args.get('degree', 2))
236 
237  elif self.__type == type_metric.CANBERRA:
238  return canberra_distance
239 
240  elif self.__type == type_metric.CHI_SQUARE:
241  return chi_square_distance
242 
243  elif self.__type == type_metric.GOWER:
244  max_range = self.__get_gower_max_range()
245  return lambda point1, point2: gower_distance(point1, point2, max_range)
246 
247  elif self.__type == type_metric.USER_DEFINED:
248  return self.__func
249 
250  else:
251  raise ValueError("Unknown type of metric: '%d'", self.__type)
252 
253 
254  def __get_gower_max_range(self):
255  """!
256  @brief Returns max range for Gower distance using input parameters ('max_range' or 'data').
257 
258  @return (numpy.array) Max range for Gower distance.
259 
260  """
261  max_range = self.__args.get('max_range', None)
262  if max_range is None:
263  data = self.__args.get('data', None)
264  if data is None:
265  raise ValueError("Gower distance requires 'data' or 'max_range' argument to construct metric.")
266 
267  max_range = numpy.max(data, axis=0) - numpy.min(data, axis=0)
268  self.__args['max_range'] = max_range
269 
270  return max_range
271 
272 
273  def __create_distance_calculator_numpy(self):
274  """!
275  @brief Creates distance metric calculator that uses numpy.
276 
277  @return (callable) Callable object of distance metric calculator.
278 
279  """
280  if self.__type == type_metric.EUCLIDEAN:
281  return euclidean_distance_numpy
282 
283  elif self.__type == type_metric.EUCLIDEAN_SQUARE:
284  return euclidean_distance_square_numpy
285 
286  elif self.__type == type_metric.MANHATTAN:
287  return manhattan_distance_numpy
288 
289  elif self.__type == type_metric.CHEBYSHEV:
290  return chebyshev_distance_numpy
291 
292  elif self.__type == type_metric.MINKOWSKI:
293  return lambda object1, object2: minkowski_distance_numpy(object1, object2, self.__args.get('degree', 2))
294 
295  elif self.__type == type_metric.CANBERRA:
296  return canberra_distance_numpy
297 
298  elif self.__type == type_metric.CHI_SQUARE:
299  return chi_square_distance_numpy
300 
301  elif self.__type == type_metric.GOWER:
302  max_range = self.__get_gower_max_range()
303  return lambda object1, object2: gower_distance_numpy(object1, object2, max_range)
304 
305  elif self.__type == type_metric.USER_DEFINED:
306  return self.__func
307 
308  else:
309  raise ValueError("Unknown type of metric: '%d'", self.__type)
310 
311 
312 
313 def euclidean_distance(point1, point2):
314  """!
315  @brief Calculate Euclidean distance between two vectors.
316  @details The Euclidean between vectors (points) a and b is calculated by following formula:
317 
318  \f[
319  dist(a, b) = \sqrt{ \sum_{i=0}^{N}(a_{i} - b_{i})^{2} };
320  \f]
321 
322  Where N is a length of each vector.
323 
324  @param[in] point1 (array_like): The first vector.
325  @param[in] point2 (array_like): The second vector.
326 
327  @return (double) Euclidean distance between two vectors.
328 
329  @see euclidean_distance_square, manhattan_distance, chebyshev_distance
330 
331  """
332  distance = euclidean_distance_square(point1, point2)
333  return distance ** 0.5
334 
335 
336 def euclidean_distance_numpy(object1, object2):
337  """!
338  @brief Calculate Euclidean distance between two objects using numpy.
339 
340  @param[in] object1 (array_like): The first array_like object.
341  @param[in] object2 (array_like): The second array_like object.
342 
343  @return (double) Euclidean distance between two objects.
344 
345  """
346  return numpy.sum(numpy.sqrt(numpy.square(object1 - object2)), axis=1).T
347 
348 
349 def euclidean_distance_square(point1, point2):
350  """!
351  @brief Calculate square Euclidean distance between two vectors.
352 
353  \f[
354  dist(a, b) = \sum_{i=0}^{N}(a_{i} - b_{i})^{2};
355  \f]
356 
357  @param[in] point1 (array_like): The first vector.
358  @param[in] point2 (array_like): The second vector.
359 
360  @return (double) Square Euclidean distance between two vectors.
361 
362  @see euclidean_distance, manhattan_distance, chebyshev_distance
363 
364  """
365  distance = 0.0
366  for i in range(len(point1)):
367  distance += (point1[i] - point2[i]) ** 2.0
368 
369  return distance
370 
371 
372 def euclidean_distance_square_numpy(object1, object2):
373  """!
374  @brief Calculate square Euclidean distance between two objects using numpy.
375 
376  @param[in] object1 (array_like): The first array_like object.
377  @param[in] object2 (array_like): The second array_like object.
378 
379  @return (double) Square Euclidean distance between two objects.
380 
381  """
382  return numpy.sum(numpy.square(object1 - object2), axis=1).T
383 
384 
385 def manhattan_distance(point1, point2):
386  """!
387  @brief Calculate Manhattan distance between between two vectors.
388 
389  \f[
390  dist(a, b) = \sum_{i=0}^{N}\left | a_{i} - b_{i} \right |;
391  \f]
392 
393  @param[in] point1 (array_like): The first vector.
394  @param[in] point2 (array_like): The second vector.
395 
396  @return (double) Manhattan distance between two vectors.
397 
398  @see euclidean_distance_square, euclidean_distance, chebyshev_distance
399 
400  """
401  distance = 0.0
402  dimension = len(point1)
403 
404  for i in range(dimension):
405  distance += abs(point1[i] - point2[i])
406 
407  return distance
408 
409 
410 def manhattan_distance_numpy(object1, object2):
411  """!
412  @brief Calculate Manhattan distance between two objects using numpy.
413 
414  @param[in] object1 (array_like): The first array_like object.
415  @param[in] object2 (array_like): The second array_like object.
416 
417  @return (double) Manhattan distance between two objects.
418 
419  """
420  return numpy.sum(numpy.absolute(object1 - object2), axis=1).T
421 
422 
423 def chebyshev_distance(point1, point2):
424  """!
425  @brief Calculate Chebyshev distance (maximum metric) between between two vectors.
426  @details Chebyshev distance is a metric defined on a vector space where the distance between two vectors is the
427  greatest of their differences along any coordinate dimension.
428 
429  \f[
430  dist(a, b) = \max_{}i\left (\left | a_{i} - b_{i} \right |\right );
431  \f]
432 
433  @param[in] point1 (array_like): The first vector.
434  @param[in] point2 (array_like): The second vector.
435 
436  @return (double) Chebyshev distance between two vectors.
437 
438  @see euclidean_distance_square, euclidean_distance, minkowski_distance
439 
440  """
441  distance = 0.0
442  dimension = len(point1)
443 
444  for i in range(dimension):
445  distance = max(distance, abs(point1[i] - point2[i]))
446 
447  return distance
448 
449 
450 def chebyshev_distance_numpy(object1, object2):
451  """!
452  @brief Calculate Chebyshev distance between two objects using numpy.
453 
454  @param[in] object1 (array_like): The first array_like object.
455  @param[in] object2 (array_like): The second array_like object.
456 
457  @return (double) Chebyshev distance between two objects.
458 
459  """
460  return numpy.max(numpy.absolute(object1 - object2), axis=1).T
461 
462 
463 def minkowski_distance(point1, point2, degree=2):
464  """!
465  @brief Calculate Minkowski distance between two vectors.
466 
467  \f[
468  dist(a, b) = \sqrt[p]{ \sum_{i=0}^{N}\left(a_{i} - b_{i}\right)^{p} };
469  \f]
470 
471  @param[in] point1 (array_like): The first vector.
472  @param[in] point2 (array_like): The second vector.
473  @param[in] degree (numeric): Degree of that is used for Minkowski distance.
474 
475  @return (double) Minkowski distance between two vectors.
476 
477  @see euclidean_distance
478 
479  """
480  distance = 0.0
481  for i in range(len(point1)):
482  distance += (point1[i] - point2[i]) ** degree
483 
484  return distance ** (1.0 / degree)
485 
486 
487 def minkowski_distance_numpy(object1, object2, degree=2):
488  """!
489  @brief Calculate Minkowski distance between objects using numpy.
490 
491  @param[in] object1 (array_like): The first array_like object.
492  @param[in] object2 (array_like): The second array_like object.
493  @param[in] degree (numeric): Degree of that is used for Minkowski distance.
494 
495  @return (double) Minkowski distance between two object.
496 
497  """
498  return numpy.sum(numpy.power(numpy.power(object1 - object2, degree), 1/degree), axis=1).T
499 
500 
501 def canberra_distance(point1, point2):
502  """!
503  @brief Calculate Canberra distance between two vectors.
504 
505  \f[
506  dist(a, b) = \sum_{i=0}^{N}\frac{\left | a_{i} - b_{i} \right |}{\left | a_{i} \right | + \left | b_{i} \right |};
507  \f]
508 
509  @param[in] point1 (array_like): The first vector.
510  @param[in] point2 (array_like): The second vector.
511 
512  @return (float) Canberra distance between two objects.
513 
514  """
515  distance = 0.0
516  for i in range(len(point1)):
517  divider = abs(point1[i]) + abs(point2[i])
518  if divider == 0.0:
519  continue
520 
521  distance += abs(point1[i] - point2[i]) / divider
522 
523  return distance
524 
525 
526 def canberra_distance_numpy(object1, object2):
527  """!
528  @brief Calculate Canberra distance between two objects using numpy.
529 
530  @param[in] object1 (array_like): The first vector.
531  @param[in] object2 (array_like): The second vector.
532 
533  @return (float) Canberra distance between two objects.
534 
535  """
536  with numpy.errstate(divide='ignore', invalid='ignore'):
537  result = numpy.divide(numpy.abs(object1 - object2), numpy.abs(object1) + numpy.abs(object2))
538 
539  if len(result.shape) > 1:
540  return numpy.sum(numpy.nan_to_num(result), axis=1).T
541  else:
542  return numpy.sum(numpy.nan_to_num(result))
543 
544 
545 def chi_square_distance(point1, point2):
546  """!
547  @brief Calculate Chi square distance between two vectors.
548 
549  \f[
550  dist(a, b) = \sum_{i=0}^{N}\frac{\left ( a_{i} - b_{i} \right )^{2}}{\left | a_{i} \right | + \left | b_{i} \right |};
551  \f]
552 
553  @param[in] point1 (array_like): The first vector.
554  @param[in] point2 (array_like): The second vector.
555 
556  @return (float) Chi square distance between two objects.
557 
558  """
559  distance = 0.0
560  for i in range(len(point1)):
561  divider = abs(point1[i]) + abs(point2[i])
562  if divider != 0.0:
563  distance += ((point1[i] - point2[i]) ** 2.0) / divider
564 
565  return distance
566 
567 
568 def chi_square_distance_numpy(object1, object2):
569  """!
570  @brief Calculate Chi square distance between two vectors using numpy.
571 
572  @param[in] object1 (array_like): The first vector.
573  @param[in] object2 (array_like): The second vector.
574 
575  @return (float) Chi square distance between two objects.
576 
577  """
578  with numpy.errstate(divide='ignore', invalid='ignore'):
579  result = numpy.divide(numpy.power(object1 - object2, 2), numpy.abs(object1) + numpy.abs(object2))
580 
581  if len(result.shape) > 1:
582  return numpy.sum(numpy.nan_to_num(result), axis=1).T
583  else:
584  return numpy.sum(numpy.nan_to_num(result))
585 
586 
587 def gower_distance(point1, point2, max_range):
588  """!
589  @brief Calculate Gower distance between two vectors.
590  @details Implementation is based on the paper @cite article::utils::metric::gower. Gower distance is calculate
591  using following formula:
592  \f[
593  dist\left ( a, b \right )=\frac{1}{p}\sum_{i=0}^{p}\frac{\left | a_{i} - b_{i} \right |}{R_{i}},
594  \f]
595 
596  where \f$R_{i}\f$ is a max range for ith dimension. \f$R\f$ is defined in line following formula:
597 
598  \f[
599  R=max\left ( X \right )-min\left ( X \right )
600  \f]
601 
602  @param[in] point1 (array_like): The first vector.
603  @param[in] point2 (array_like): The second vector.
604  @param[in] max_range (array_like): Max range in each data dimension.
605 
606  @return (float) Gower distance between two objects.
607 
608  """
609  distance = 0.0
610  dimensions = len(point1)
611  for i in range(dimensions):
612  if max_range[i] != 0.0:
613  distance += abs(point1[i] - point2[i]) / max_range[i]
614 
615  return distance / dimensions
616 
617 
618 def gower_distance_numpy(point1, point2, max_range):
619  """!
620  @brief Calculate Gower distance between two vectors using numpy.
621 
622  @param[in] point1 (array_like): The first vector.
623  @param[in] point2 (array_like): The second vector.
624  @param[in] max_range (array_like): Max range in each data dimension.
625 
626  @return (float) Gower distance between two objects.
627 
628  """
629  with numpy.errstate(divide='ignore', invalid='ignore'):
630  result = numpy.divide(numpy.abs(point1 - point2), max_range)
631 
632  if len(result.shape) > 1:
633  return numpy.sum(numpy.nan_to_num(result), axis=1).T / len(point1)
634  else:
635  return numpy.sum(numpy.nan_to_num(result)) / len(point1)
def __create_distance_calculator_basic(self)
Creates distance metric calculator that does not use numpy.
Definition: metric.py:215
def chi_square_distance(point1, point2)
Calculate Chi square distance between two vectors.
Definition: metric.py:545
def get_arguments(self)
Return additional arguments that are used by distance metric.
Definition: metric.py:160
def euclidean_distance_square(point1, point2)
Calculate square Euclidean distance between two vectors.
Definition: metric.py:349
def minkowski_distance_numpy(object1, object2, degree=2)
Calculate Minkowski distance between objects using numpy.
Definition: metric.py:487
def chi_square_distance_numpy(object1, object2)
Calculate Chi square distance between two vectors using numpy.
Definition: metric.py:568
def __create_distance_calculator(self)
Creates distance metric calculator.
Definition: metric.py:202
def get_type(self)
Return type of distance metric that is used.
Definition: metric.py:150
def chebyshev_distance_numpy(object1, object2)
Calculate Chebyshev distance between two objects using numpy.
Definition: metric.py:450
Distance metric performs distance calculation between two points in line with encapsulated function...
Definition: metric.py:67
def manhattan_distance_numpy(object1, object2)
Calculate Manhattan distance between two objects using numpy.
Definition: metric.py:410
def __init__(self, metric_type, kwargs)
Creates distance metric instance for calculation distance between two points.
Definition: metric.py:110
def gower_distance(point1, point2, max_range)
Calculate Gower distance between two vectors.
Definition: metric.py:587
def get_function(self)
Return user-defined function for calculation distance metric.
Definition: metric.py:170
def gower_distance_numpy(point1, point2, max_range)
Calculate Gower distance between two vectors using numpy.
Definition: metric.py:618
def disable_numpy_usage(self)
Stop using numpy for distance calculation.
Definition: metric.py:191
def canberra_distance(point1, point2)
Calculate Canberra distance between two vectors.
Definition: metric.py:501
def canberra_distance_numpy(object1, object2)
Calculate Canberra distance between two objects using numpy.
Definition: metric.py:526
def euclidean_distance_square_numpy(object1, object2)
Calculate square Euclidean distance between two objects using numpy.
Definition: metric.py:372
def __call__(self, point1, point2)
Calculates distance between two points.
Definition: metric.py:137
def __create_distance_calculator_numpy(self)
Creates distance metric calculator that uses numpy.
Definition: metric.py:273
def euclidean_distance(point1, point2)
Calculate Euclidean distance between two vectors.
Definition: metric.py:313
def manhattan_distance(point1, point2)
Calculate Manhattan distance between between two vectors.
Definition: metric.py:385
def minkowski_distance(point1, point2, degree=2)
Calculate Minkowski distance between two vectors.
Definition: metric.py:463
def euclidean_distance_numpy(object1, object2)
Calculate Euclidean distance between two objects using numpy.
Definition: metric.py:336
def enable_numpy_usage(self)
Start numpy for distance calculation.
Definition: metric.py:180
def __get_gower_max_range(self)
Returns max range for Gower distance using input parameters (&#39;max_range&#39; or &#39;data&#39;).
Definition: metric.py:254
Enumeration of supported metrics in the module for distance calculation between two points...
Definition: metric.py:32
def chebyshev_distance(point1, point2)
Calculate Chebyshev distance (maximum metric) between between two vectors.
Definition: metric.py:423