sampling.py
1 """!
2 
3 @brief Module provides various random sampling algorithms.
4 
5 @authors Andrei Novikov (pyclustering@yandex.ru)
6 @date 2014-2020
7 @copyright GNU Public License
8 
9 @cond GNU_PUBLIC_LICENSE
10  PyClustering is free software: you can redistribute it and/or modify
11  it under the terms of the GNU General Public License as published by
12  the Free Software Foundation, either version 3 of the License, or
13  (at your option) any later version.
14 
15  PyClustering is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  GNU General Public License for more details.
19 
20  You should have received a copy of the GNU General Public License
21  along with this program. If not, see <http://www.gnu.org/licenses/>.
22 @endcond
23 
24 """
25 
26 
27 import random
28 
29 
30 def reservoir_r(data, n):
31  """!
32  @brief Performs data sampling using Reservoir Algorithm R.
33  @details Algorithm complexity O(n). Implementation is based on paper @cite article::utils::sampling::1. Average
34  number of uniform random variates: \f$N - n\f$.
35 
36  @param[in] data (list): Input data for sampling.
37  @param[in] n (uint): Size of sample that should be extracted from 'data'.
38 
39  @return (list) Sample with size 'n' from 'data'.
40 
41  Generate random samples with 5 elements and with 3 elements using Reservoir Algorithm R:
42  @code
43  data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
44  sample = reservoir_r(data, 5) # generate sample with 5 elements for 'data'.
45  print(sample)
46 
47  sample = reservoir_r(data, 3) # generate sample with 3 elements for 'data'.
48  print(sample)
49  @endcode
50 
51  Output example for the code above:
52  @code
53  [20, 7, 17, 12, 11]
54  [12, 2, 10]
55  @endcode
56 
57  """
58  if n > len(data):
59  raise ValueError("Incorrect sampling value 'n' (%d) that should be bigger then data size (%d).")
60 
61  random.seed()
62  reservoir = data[0:n]
63 
64  for i in range(n, len(data)):
65  m = random.randrange(0, i + 1)
66  if m < n:
67  reservoir[m] = data[i]
68 
69  return reservoir
70 
71 
72 def reservoir_x(data, n):
73  """!
74  @brief Performs data sampling using Reservoir Algorithm X.
75  @details Algorithm complexity O(n). Implementation is based on paper @cite article::utils::sampling::1. Average
76  number of uniform random variates:
77  \f[\approx 2n\ln \left (\frac{N}{n} \right)\f]
78 
79  @param[in] data (list): Input data for sampling.
80  @param[in] n (uint): Size of sample that should be extracted from 'data'.
81 
82  @return (list) Sample with size 'n' from 'data'.
83 
84  Generate random sample with 5 elements using Reservoir Algorithm X:
85  @code
86  data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
87  sample = reservoir_x(data, 10) # generate sample with 5 elements for 'data'.
88  print(sample)
89  @endcode
90 
91  Output example for the code above:
92  @code
93  [0, 20, 2, 16, 13, 15, 19, 18, 10, 9]
94  @endcode
95 
96  """
97  def calculate_skip_value(t, size, skip):
98  return pow(t + 1 - size, skip + 1) / pow(t + 1, skip + 1)
99 
100  def generate_skip_value(t, size):
101  threshold, skip = random.random(), 0
102  while calculate_skip_value(t, size, skip) > threshold:
103  skip += 1
104  return skip
105 
106  if n > len(data):
107  raise ValueError("Incorrect sampling value 'n' (%d) that should be bigger then data size (%d).")
108 
109  random.seed()
110  reservoir = data[0:n]
111 
112  i = n
113  while i < len(data):
114  i += generate_skip_value(i, n)
115 
116  if i < len(data):
117  m = random.randrange(0, n)
118  reservoir[m] = data[i]
119 
120  i += 1
121 
122  return reservoir
def reservoir_r(data, n)
Performs data sampling using Reservoir Algorithm R.
Definition: sampling.py:30
def reservoir_x(data, n)
Performs data sampling using Reservoir Algorithm X.
Definition: sampling.py:72