3 @brief Module provides various distance metrics - abstraction of the notion of distance in a metric space.
5 @authors Andrei Novikov (pyclustering@yandex.ru)
7 @copyright BSD-3-Clause
14 from enum
import IntEnum
19 @brief Enumeration of supported metrics in the module for distance calculation between two points.
54 @brief Distance metric performs distance calculation between two points in line with encapsulated function, for
55 example, euclidean distance or chebyshev distance, or even user-defined.
59 Example of Euclidean distance metric:
61 metric = distance_metric(type_metric.EUCLIDEAN)
62 distance = metric([1.0, 2.5], [-1.2, 3.4])
65 Example of Chebyshev distance metric:
67 metric = distance_metric(type_metric.CHEBYSHEV)
68 distance = metric([0.0, 0.0], [2.5, 6.0])
71 In the following example an additional argument is specified that is specific to the Minkowski distance ('degree' is an
72 optional argument that is equal to '2' by default):
74 metric = distance_metric(type_metric.MINKOWSKI, degree=4)
75 distance = metric([4.0, 9.2, 1.0], [3.4, 2.5, 6.2])
78 A user may define their own function for distance calculation. In this case the input is two points. For example, if you
79 want to implement your own version of Manhattan distance:
81 from pyclustering.utils.metric import distance_metric, type_metric
83 def my_manhattan(point1, point2):
84 dimension = len(point1)
86 for i in range(dimension):
87 result += abs(point1[i] - point2[i]) * 0.1
90 metric = distance_metric(type_metric.USER_DEFINED, func=my_manhattan)
91 distance = metric([2.0, 3.0], [1.0, 3.0])
97 @brief Creates a distance metric instance for calculating the distance between two points.
99 @param[in] metric_type (type_metric): Type of the metric that is used for distance calculation.
100 @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'numpy_usage', 'func' and corresponding additional arguments
101 for specific metric types).
103 <b>Keyword Args:</b><br>
104 - func (callable): Callable object with two arguments (point #1 and point #2) or (object #1 and object #2) in case of numpy usage.
105 This argument is used only if metric is 'type_metric.USER_DEFINED'.
106 - degree (numeric): Only for 'type_metric.MINKOWSKI' - degree of Minkowski equation.
107 - max_range (array_like): Only for 'type_metric.GOWER' - max range in each dimension. 'data' can be used
108 instead of this parameter.
109 - data (array_like): Only for 'type_metric.GOWER' - input data that is used for 'max_range' calculation.
110 'max_range' can be used instead of this parameter.
111 - numpy_usage (bool): If True then numpy is used for calculation (by default is False).
124 @brief Calculates distance between two points.
126 @param[in] point1 (list): The first point.
127 @param[in] point2 (list): The second point.
129 @return (double) Distance between two points.
137 @brief Return type of distance metric that is used.
139 @return (type_metric) Type of distance metric.
147 @brief Return additional arguments that are used by distance metric.
149 @return (dict) Additional arguments.
157 @brief Return user-defined function for calculation distance metric.
159 @return (callable): User-defined distance metric function.
167 @brief Start numpy for distance calculation.
168 @details Useful when working with matrices to increase performance. No effect in case of type_metric.USER_DEFINED type.
172 if self.
__type != type_metric.USER_DEFINED:
178 @brief Stop using numpy for distance calculation.
179 @details Useful in case of a large number of small data portions, when a numpy call takes longer than the calculation itself.
180 No effect in case of type_metric.USER_DEFINED type.
187 def __create_distance_calculator(self):
189 @brief Creates distance metric calculator.
191 @return (callable) Callable object of distance metric calculator.
200 def __create_distance_calculator_basic(self):
202 @brief Creates distance metric calculator that does not use numpy.
204 @return (callable) Callable object of distance metric calculator.
207 if self.
__type == type_metric.EUCLIDEAN:
208 return euclidean_distance
210 elif self.
__type == type_metric.EUCLIDEAN_SQUARE:
211 return euclidean_distance_square
213 elif self.
__type == type_metric.MANHATTAN:
214 return manhattan_distance
216 elif self.
__type == type_metric.CHEBYSHEV:
217 return chebyshev_distance
219 elif self.
__type == type_metric.MINKOWSKI:
222 elif self.
__type == type_metric.CANBERRA:
223 return canberra_distance
225 elif self.
__type == type_metric.CHI_SQUARE:
226 return chi_square_distance
228 elif self.
__type == type_metric.GOWER:
230 return lambda point1, point2:
gower_distance(point1, point2, max_range)
232 elif self.
__type == type_metric.USER_DEFINED:
236 raise ValueError(
"Unknown type of metric: '%d'", self.
__type)
239 def __get_gower_max_range(self):
241 @brief Returns max range for Gower distance using input parameters ('max_range' or 'data').
243 @return (numpy.array) Max range for Gower distance.
246 max_range = self.
__args.get(
'max_range',
None)
247 if max_range
is None:
248 data = self.
__args.get(
'data',
None)
250 raise ValueError(
"Gower distance requires 'data' or 'max_range' argument to construct metric.")
252 max_range = numpy.max(data, axis=0) - numpy.min(data, axis=0)
253 self.
__args[
'max_range'] = max_range
258 def __create_distance_calculator_numpy(self):
260 @brief Creates distance metric calculator that uses numpy.
262 @return (callable) Callable object of distance metric calculator.
265 if self.
__type == type_metric.EUCLIDEAN:
266 return euclidean_distance_numpy
268 elif self.
__type == type_metric.EUCLIDEAN_SQUARE:
269 return euclidean_distance_square_numpy
271 elif self.
__type == type_metric.MANHATTAN:
272 return manhattan_distance_numpy
274 elif self.
__type == type_metric.CHEBYSHEV:
275 return chebyshev_distance_numpy
277 elif self.
__type == type_metric.MINKOWSKI:
280 elif self.
__type == type_metric.CANBERRA:
281 return canberra_distance_numpy
283 elif self.
__type == type_metric.CHI_SQUARE:
284 return chi_square_distance_numpy
286 elif self.
__type == type_metric.GOWER:
290 elif self.
__type == type_metric.USER_DEFINED:
294 raise ValueError(
"Unknown type of metric: '%d'", self.
__type)
300 @brief Calculate Euclidean distance between two vectors.
301 @details The Euclidean distance between vectors (points) a and b is calculated by the following formula:
304 dist(a, b) = \sqrt{ \sum_{i=0}^{N}(a_{i} - b_{i})^{2} };
307 Where N is a length of each vector.
309 @param[in] point1 (array_like): The first vector.
310 @param[in] point2 (array_like): The second vector.
312 @return (double) Euclidean distance between two vectors.
314 @see euclidean_distance_square, manhattan_distance, chebyshev_distance
318 return distance ** 0.5
323 @brief Calculate Euclidean distance between two objects using numpy.
325 @param[in] object1 (array_like): The first array_like object.
326 @param[in] object2 (array_like): The second array_like object.
328 @return (double) Euclidean distance between two objects.
331 if len(object1.shape) > 1
or len(object2.shape) > 1:
332 return numpy.sqrt(numpy.sum(numpy.square(object1 - object2), axis=1))
334 return numpy.sqrt(numpy.sum(numpy.square(object1 - object2)))
339 @brief Calculate square Euclidean distance between two vectors.
342 dist(a, b) = \sum_{i=0}^{N}(a_{i} - b_{i})^{2};
345 @param[in] point1 (array_like): The first vector.
346 @param[in] point2 (array_like): The second vector.
348 @return (double) Square Euclidean distance between two vectors.
350 @see euclidean_distance, manhattan_distance, chebyshev_distance
354 for i
in range(len(point1)):
355 distance += (point1[i] - point2[i]) ** 2.0
362 @brief Calculate square Euclidean distance between two objects using numpy.
364 @param[in] object1 (array_like): The first array_like object.
365 @param[in] object2 (array_like): The second array_like object.
367 @return (double) Square Euclidean distance between two objects.
370 if len(object1.shape) > 1
or len(object2.shape) > 1:
371 return numpy.sum(numpy.square(object1 - object2), axis=1).T
373 return numpy.sum(numpy.square(object1 - object2))
378 @brief Calculate Manhattan distance between two vectors.
381 dist(a, b) = \sum_{i=0}^{N}\left | a_{i} - b_{i} \right |;
384 @param[in] point1 (array_like): The first vector.
385 @param[in] point2 (array_like): The second vector.
387 @return (double) Manhattan distance between two vectors.
389 @see euclidean_distance_square, euclidean_distance, chebyshev_distance
393 dimension = len(point1)
395 for i
in range(dimension):
396 distance += abs(point1[i] - point2[i])
403 @brief Calculate Manhattan distance between two objects using numpy.
405 @param[in] object1 (array_like): The first array_like object.
406 @param[in] object2 (array_like): The second array_like object.
408 @return (double) Manhattan distance between two objects.
411 if len(object1.shape) > 1
or len(object2.shape) > 1:
412 return numpy.sum(numpy.absolute(object1 - object2), axis=1).T
414 return numpy.sum(numpy.absolute(object1 - object2))
419 @brief Calculate Chebyshev distance (maximum metric) between two vectors.
420 @details Chebyshev distance is a metric defined on a vector space where the distance between two vectors is the
421 greatest of their differences along any coordinate dimension.
424 dist(a, b) = \max_{i}\left (\left | a_{i} - b_{i} \right |\right );
427 @param[in] point1 (array_like): The first vector.
428 @param[in] point2 (array_like): The second vector.
430 @return (double) Chebyshev distance between two vectors.
432 @see euclidean_distance_square, euclidean_distance, minkowski_distance
436 dimension = len(point1)
438 for i
in range(dimension):
439 distance = max(distance, abs(point1[i] - point2[i]))
446 @brief Calculate Chebyshev distance between two objects using numpy.
448 @param[in] object1 (array_like): The first array_like object.
449 @param[in] object2 (array_like): The second array_like object.
451 @return (double) Chebyshev distance between two objects.
454 if len(object1.shape) > 1
or len(object2.shape) > 1:
455 return numpy.max(numpy.absolute(object1 - object2), axis=1).T
457 return numpy.max(numpy.absolute(object1 - object2))
462 @brief Calculate Minkowski distance between two vectors.
465 dist(a, b) = \sqrt[p]{ \sum_{i=0}^{N}\left(a_{i} - b_{i}\right)^{p} };
468 @param[in] point1 (array_like): The first vector.
469 @param[in] point2 (array_like): The second vector.
470 @param[in] degree (numeric): Degree that is used for Minkowski distance.
472 @return (double) Minkowski distance between two vectors.
474 @see euclidean_distance
478 for i
in range(len(point1)):
479 distance += (point1[i] - point2[i]) ** degree
481 return distance ** (1.0 / degree)
486 @brief Calculate Minkowski distance between objects using numpy.
488 @param[in] object1 (array_like): The first array_like object.
489 @param[in] object2 (array_like): The second array_like object.
490 @param[in] degree (numeric): Degree that is used for Minkowski distance.
492 @return (double) Minkowski distance between two objects.
495 if len(object1.shape) > 1
or len(object2.shape) > 1:
496 return numpy.power(numpy.sum(numpy.power(object1 - object2, degree), axis=1), 1/degree)
498 return numpy.power(numpy.sum(numpy.power(object1 - object2, degree)), 1 / degree)
503 @brief Calculate Canberra distance between two vectors.
506 dist(a, b) = \sum_{i=0}^{N}\frac{\left | a_{i} - b_{i} \right |}{\left | a_{i} \right | + \left | b_{i} \right |};
509 @param[in] point1 (array_like): The first vector.
510 @param[in] point2 (array_like): The second vector.
512 @return (float) Canberra distance between two objects.
516 for i
in range(len(point1)):
517 divider = abs(point1[i]) + abs(point2[i])
521 distance += abs(point1[i] - point2[i]) / divider
528 @brief Calculate Canberra distance between two objects using numpy.
530 @param[in] object1 (array_like): The first vector.
531 @param[in] object2 (array_like): The second vector.
533 @return (float) Canberra distance between two objects.
536 with numpy.errstate(divide=
'ignore', invalid=
'ignore'):
537 result = numpy.divide(numpy.abs(object1 - object2), numpy.abs(object1) + numpy.abs(object2))
539 if len(result.shape) > 1:
540 return numpy.sum(numpy.nan_to_num(result), axis=1).T
542 return numpy.sum(numpy.nan_to_num(result))
547 @brief Calculate Chi square distance between two vectors.
550 dist(a, b) = \sum_{i=0}^{N}\frac{\left ( a_{i} - b_{i} \right )^{2}}{\left | a_{i} \right | + \left | b_{i} \right |};
553 @param[in] point1 (array_like): The first vector.
554 @param[in] point2 (array_like): The second vector.
556 @return (float) Chi square distance between two objects.
560 for i
in range(len(point1)):
561 divider = abs(point1[i]) + abs(point2[i])
563 distance += ((point1[i] - point2[i]) ** 2.0) / divider
570 @brief Calculate Chi square distance between two vectors using numpy.
572 @param[in] object1 (array_like): The first vector.
573 @param[in] object2 (array_like): The second vector.
575 @return (float) Chi square distance between two objects.
578 with numpy.errstate(divide=
'ignore', invalid=
'ignore'):
579 result = numpy.divide(numpy.power(object1 - object2, 2), numpy.abs(object1) + numpy.abs(object2))
581 if len(result.shape) > 1:
582 return numpy.sum(numpy.nan_to_num(result), axis=1).T
584 return numpy.sum(numpy.nan_to_num(result))
589 @brief Calculate Gower distance between two vectors.
590 @details Implementation is based on the paper @cite article::utils::metric::gower. Gower distance is calculated
591 using the following formula:
593 dist\left ( a, b \right )=\frac{1}{p}\sum_{i=0}^{p}\frac{\left | a_{i} - b_{i} \right |}{R_{i}},
596 where \f$R_{i}\f$ is a max range for ith dimension. \f$R\f$ is defined in line with the following formula:
599 R=max\left ( X \right )-min\left ( X \right )
602 @param[in] point1 (array_like): The first vector.
603 @param[in] point2 (array_like): The second vector.
604 @param[in] max_range (array_like): Max range in each data dimension.
606 @return (float) Gower distance between two objects.
610 dimensions = len(point1)
611 for i
in range(dimensions):
612 if max_range[i] != 0.0:
613 distance += abs(point1[i] - point2[i]) / max_range[i]
615 return distance / dimensions
620 @brief Calculate Gower distance between two vectors using numpy.
622 @param[in] point1 (array_like): The first vector.
623 @param[in] point2 (array_like): The second vector.
624 @param[in] max_range (array_like): Max range in each data dimension.
626 @return (float) Gower distance between two objects.
629 with numpy.errstate(divide=
'ignore', invalid=
'ignore'):
630 result = numpy.divide(numpy.abs(point1 - point2), max_range)
632 if len(result.shape) > 1:
633 return numpy.sum(numpy.nan_to_num(result), axis=1).T / len(result[0])
635 return numpy.sum(numpy.nan_to_num(result)) / len(point1)