1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18  """ 
 19  MLlib utilities for linear algebra. For dense vectors, MLlib 
 20  uses the NumPy C{array} type, so you can simply pass NumPy arrays 
 21  around. For sparse vectors, users can construct a L{SparseVector} 
 22  object from MLlib or pass SciPy C{scipy.sparse} column vectors if 
 23  SciPy is available in their environment. 
 24  """ 
 25   
 26  import numpy 
 27  from numpy import array, array_equal, ndarray, float64, int32 
 31   
 32      """ 
 33      A simple sparse vector class for passing data to MLlib. Users may 
 34      alternatively pass SciPy's {scipy.sparse} data types. 
 35      """ 
 36   
 38          """ 
 39          Create a sparse vector, using either a dictionary, a list of 
 40          (index, value) pairs, or two separate arrays of indices and 
 41          values (sorted by index). 
 42   
 43          @param size: Size of the vector. 
 44          @param args: Non-zero entries, as a dictionary, list of tupes, 
 45                 or two sorted lists containing indices and values. 
 46   
 47          >>> print SparseVector(4, {1: 1.0, 3: 5.5}) 
 48          (4,[1,3],[1.0,5.5]) 
 49          >>> print SparseVector(4, [(1, 1.0), (3, 5.5)]) 
 50          (4,[1,3],[1.0,5.5]) 
 51          >>> print SparseVector(4, [1, 3], [1.0, 5.5]) 
 52          (4,[1,3],[1.0,5.5]) 
 53          """ 
 54          self.size = int(size) 
 55          assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" 
 56          if len(args) == 1: 
 57              pairs = args[0] 
 58              if type(pairs) == dict: 
 59                  pairs = pairs.items() 
 60              pairs = sorted(pairs) 
 61              self.indices = array([p[0] for p in pairs], dtype=int32) 
 62              self.values = array([p[1] for p in pairs], dtype=float64) 
 63          else: 
 64              assert len(args[0]) == len(args[1]), "index and value arrays not same length" 
 65              self.indices = array(args[0], dtype=int32) 
 66              self.values = array(args[1], dtype=float64) 
 67              for i in xrange(len(self.indices) - 1): 
 68                  if self.indices[i] >= self.indices[i + 1]: 
 69                      raise TypeError("indices array must be sorted") 
  70   
 71 -    def dot(self, other): 
  72          """ 
 73          Dot product with a SparseVector or 1- or 2-dimensional Numpy array. 
 74   
 75          >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 
 76          >>> a.dot(a) 
 77          25.0 
 78          >>> a.dot(array([1., 2., 3., 4.])) 
 79          22.0 
 80          >>> b = SparseVector(4, [2, 4], [1.0, 2.0]) 
 81          >>> a.dot(b) 
 82          0.0 
 83          >>> a.dot(array([[1, 1], [2, 2], [3, 3], [4, 4]])) 
 84          array([ 22.,  22.]) 
 85          """ 
 86          if type(other) == ndarray: 
 87              if other.ndim == 1: 
 88                  result = 0.0 
 89                  for i in xrange(len(self.indices)): 
 90                      result += self.values[i] * other[self.indices[i]] 
 91                  return result 
 92              elif other.ndim == 2: 
 93                  results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] 
 94                  return array(results) 
 95              else: 
 96                  raise Exception("Cannot call dot with %d-dimensional array" % other.ndim) 
 97          else: 
 98              result = 0.0 
 99              i, j = 0, 0 
100              while i < len(self.indices) and j < len(other.indices): 
101                  if self.indices[i] == other.indices[j]: 
102                      result += self.values[i] * other.values[j] 
103                      i += 1 
104                      j += 1 
105                  elif self.indices[i] < other.indices[j]: 
106                      i += 1 
107                  else: 
108                      j += 1 
109              return result 
 110   
112          """ 
113          Squared distance from a SparseVector or 1-dimensional NumPy array. 
114   
115          >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 
116          >>> a.squared_distance(a) 
117          0.0 
118          >>> a.squared_distance(array([1., 2., 3., 4.])) 
119          11.0 
120          >>> b = SparseVector(4, [2, 4], [1.0, 2.0]) 
121          >>> a.squared_distance(b) 
122          30.0 
123          >>> b.squared_distance(a) 
124          30.0 
125          """ 
126          if type(other) == ndarray: 
127              if other.ndim == 1: 
128                  result = 0.0 
129                  j = 0    
130                  for i in xrange(other.shape[0]): 
131                      if j < len(self.indices) and self.indices[j] == i: 
132                          diff = self.values[j] - other[i] 
133                          result += diff * diff 
134                          j += 1 
135                      else: 
136                          result += other[i] * other[i] 
137                  return result 
138              else: 
139                  raise Exception("Cannot call squared_distance with %d-dimensional array" % 
140                                  other.ndim) 
141          else: 
142              result = 0.0 
143              i, j = 0, 0 
144              while i < len(self.indices) and j < len(other.indices): 
145                  if self.indices[i] == other.indices[j]: 
146                      diff = self.values[i] - other.values[j] 
147                      result += diff * diff 
148                      i += 1 
149                      j += 1 
150                  elif self.indices[i] < other.indices[j]: 
151                      result += self.values[i] * self.values[i] 
152                      i += 1 
153                  else: 
154                      result += other.values[j] * other.values[j] 
155                      j += 1 
156              while i < len(self.indices): 
157                  result += self.values[i] * self.values[i] 
158                  i += 1 
159              while j < len(other.indices): 
160                  result += other.values[j] * other.values[j] 
161                  j += 1 
162              return result 
 163   
165          """ 
166          Returns a copy of this SparseVector as a 1-dimensional NumPy array. 
167          """ 
168          arr = numpy.zeros(self.size) 
169          for i in xrange(self.indices.size): 
170              arr[self.indices[i]] = self.values[i] 
171          return arr 
 172   
174          inds = "[" + ",".join([str(i) for i in self.indices]) + "]" 
175          vals = "[" + ",".join([str(v) for v in self.values]) + "]" 
176          return "(" + ",".join((str(self.size), inds, vals)) + ")" 
 177   
179          inds = self.indices 
180          vals = self.values 
181          entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) 
182          return "SparseVector({0}, {{{1}}})".format(self.size, entries) 
 183   
185          """ 
186          Test SparseVectors for equality. 
187   
188          >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) 
189          >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) 
190          >>> v1 == v2 
191          True 
192          >>> v1 != v2 
193          False 
194          """ 
195   
196          return (isinstance(other, self.__class__) 
197                  and other.size == self.size 
198                  and array_equal(other.indices, self.indices) 
199                  and array_equal(other.values, self.values)) 
 200   
202          return not self.__eq__(other) 
  203   
206   
207      """ 
208      Factory methods for working with vectors. Note that dense vectors 
209      are simply represented as NumPy array objects, so there is no need 
210      to covert them for use in MLlib. For sparse vectors, the factory 
211      methods in this class create an MLlib-compatible type, or users 
212      can pass in SciPy's C{scipy.sparse} column vectors. 
213      """ 
214   
215      @staticmethod 
217          """ 
218          Create a sparse vector, using either a dictionary, a list of 
219          (index, value) pairs, or two separate arrays of indices and 
220          values (sorted by index). 
221   
222          @param size: Size of the vector. 
223          @param args: Non-zero entries, as a dictionary, list of tupes, 
224                       or two sorted lists containing indices and values. 
225   
226          >>> print Vectors.sparse(4, {1: 1.0, 3: 5.5}) 
227          (4,[1,3],[1.0,5.5]) 
228          >>> print Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) 
229          (4,[1,3],[1.0,5.5]) 
230          >>> print Vectors.sparse(4, [1, 3], [1.0, 5.5]) 
231          (4,[1,3],[1.0,5.5]) 
232          """ 
233          return SparseVector(size, *args) 
 234   
235      @staticmethod 
237          """ 
238          Create a dense vector of 64-bit floats from a Python list. Always 
239          returns a NumPy array. 
240   
241          >>> Vectors.dense([1, 2, 3]) 
242          array([ 1.,  2.,  3.]) 
243          """ 
244          return array(elements, dtype=float64) 
 245   
246      @staticmethod 
248          """ 
249          Converts a vector into a string, which can be recognized by 
250          Vectors.parse(). 
251   
252          >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) 
253          '(2,[1],[1.0])' 
254          >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) 
255          '[0.0,1.0]' 
256          """ 
257          if type(vector) == SparseVector: 
258              return str(vector) 
259          else: 
260              return "[" + ",".join([str(v) for v in vector]) + "]" 
  261   
264      import doctest 
265      (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) 
266      if failure_count: 
267          exit(-1) 
 268   
269  if __name__ == "__main__": 
270       
271       
272      import sys 
273      sys.path.pop(0) 
274      _test() 
275