pyclustering  0.10.1
pyclustring is a Python, C++ data mining library.
sampling.py
1 """!
2 
3 @brief Module provides various random sampling algorithms.
4 
5 @authors Andrei Novikov (pyclustering@yandex.ru)
6 @date 2014-2020
7 @copyright BSD-3-Clause
8 
9 """
10 
11 
12 import random
13 
14 
15 def reservoir_r(data, n):
16  """!
17  @brief Performs data sampling using Reservoir Algorithm R.
18  @details Algorithm complexity O(n). Implementation is based on paper @cite article::utils::sampling::1. Average
19  number of uniform random variates: \f$N - n\f$.
20 
21  @param[in] data (list): Input data for sampling.
22  @param[in] n (uint): Size of sample that should be extracted from 'data'.
23 
24  @return (list) Sample with size 'n' from 'data'.
25 
26  Generate random samples with 5 elements and with 3 elements using Reservoir Algorithm R:
27  @code
28  data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
29  sample = reservoir_r(data, 5) # generate sample with 5 elements for 'data'.
30  print(sample)
31 
32  sample = reservoir_r(data, 3) # generate sample with 3 elements for 'data'.
33  print(sample)
34  @endcode
35 
36  Output example for the code above:
37  @code
38  [20, 7, 17, 12, 11]
39  [12, 2, 10]
40  @endcode
41 
42  """
43  if n > len(data):
44  raise ValueError("Incorrect sampling value 'n' (%d) that should be bigger then data size (%d).")
45 
46  random.seed()
47  reservoir = data[0:n]
48 
49  for i in range(n, len(data)):
50  m = random.randrange(0, i + 1)
51  if m < n:
52  reservoir[m] = data[i]
53 
54  return reservoir
55 
56 
57 def reservoir_x(data, n):
58  """!
59  @brief Performs data sampling using Reservoir Algorithm X.
60  @details Algorithm complexity O(n). Implementation is based on paper @cite article::utils::sampling::1. Average
61  number of uniform random variates:
62  \f[\approx 2n\ln \left (\frac{N}{n} \right)\f]
63 
64  @param[in] data (list): Input data for sampling.
65  @param[in] n (uint): Size of sample that should be extracted from 'data'.
66 
67  @return (list) Sample with size 'n' from 'data'.
68 
69  Generate random sample with 5 elements using Reservoir Algorithm X:
70  @code
71  data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
72  sample = reservoir_x(data, 10) # generate sample with 5 elements for 'data'.
73  print(sample)
74  @endcode
75 
76  Output example for the code above:
77  @code
78  [0, 20, 2, 16, 13, 15, 19, 18, 10, 9]
79  @endcode
80 
81  """
82  def calculate_skip_value(t, size, skip):
83  return pow(t + 1 - size, skip + 1) / pow(t + 1, skip + 1)
84 
85  def generate_skip_value(t, size):
86  threshold, skip = random.random(), 0
87  while calculate_skip_value(t, size, skip) > threshold:
88  skip += 1
89  return skip
90 
91  if n > len(data):
92  raise ValueError("Incorrect sampling value 'n' (%d) that should be bigger then data size (%d).")
93 
94  random.seed()
95  reservoir = data[0:n]
96 
97  i = n
98  while i < len(data):
99  i += generate_skip_value(i, n)
100 
101  if i < len(data):
102  m = random.randrange(0, n)
103  reservoir[m] = data[i]
104 
105  i += 1
106 
107  return reservoir
pyclustering.utils.sampling.reservoir_r
def reservoir_r(data, n)
Performs data sampling using Reservoir Algorithm R.
Definition: sampling.py:15
pyclustering.utils.sampling.reservoir_x
def reservoir_x(data, n)
Performs data sampling using Reservoir Algorithm X.
Definition: sampling.py:57