Storing NumPy data in MongoDB

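Exploring different ways to store a NumPy array in MongoDB: converting it with `tolist()`, pickling it with cPickle's default ASCII protocol, and pickling it with cPickle's binary protocol 2. Conclusion: cPickle with protocol=2 is fastest for both inserting and reading.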

In [1]:
import numpy as np
import cPickle
from pymongo import MongoClient
from bson.binary import Binary
# Prerequisite: run `mongod` in another shell to enable this connection.
# MongoClient() is called with no arguments, so pymongo's default host and
# port are used.
conn = MongoClient()
# Handle to the `random_arrays` collection in the `test_database` database.
# Every benchmark cell in this notebook clears and refills this collection.
collection = conn.test_database.random_arrays
In [2]:
# Using tolist()
# Benchmark 1: store the array as nested Python lists (ndarray.tolist()) in a
# document field, and rebuild it with np.array() when reading.
# remove() is called with no filter to clear the collection before timing, so
# documents from a previous run are not read back.
collection.remove()
print("inserting with tolist()")
# Each timed iteration generates a fresh random 50x3 array, converts it and
# inserts one document, so array generation is included in the measured time.
%timeit collection.insert({'tolist': np.random.rand(50,3).tolist()})
print("reading tolist()")
# Each timed iteration reads back *every* document left behind by the insert
# benchmark above, so the per-loop time scales with however many inserts
# %timeit performed. No -n is given here, so %timeit picks the loop count.
%timeit [np.array(x['tolist']) for x in collection.find()]
inserting with tolist()
1000 loops, best of 3: 248 us per loop
reading tolist()
1 loops, best of 3: 1.09 s per loop
In [3]:
# Using cPickle with default ASCII protocol.
# Benchmark 2: pickle the ndarray (no protocol argument, i.e. cPickle's
# default) and wrap the resulting string in bson Binary for storage.
# Clear the collection so only this cell's documents are read back.
collection.remove()
print("inserting with cpickle")
# Each timed iteration generates a fresh random 50x3 array, pickles it and
# inserts one document, so array generation is included in the measured time.
%timeit collection.insert({'cpickle': Binary(cPickle.dumps(np.random.rand(50,3)))})
print("reading cpickle")
# -n 100 fixes the number of loops per repeat at 100. Each loop unpickles
# every document currently in the collection.
# NOTE(review): cPickle.loads can execute arbitrary code from a crafted
# pickle; this is only safe while the database contents are fully trusted.
%timeit -n 100 [cPickle.loads(x['cpickle']) for x in collection.find()]
inserting with cpickle
1000 loops, best of 3: 359 us per loop
reading cpickle
100 loops, best of 3: 245 ms per loop
In [4]:
# Using cPickle with fast protocol=2.
# Benchmark 3: same as the previous cPickle benchmark, but pickling with the
# binary protocol 2 instead of the default protocol.
# Clear the collection so only this cell's documents are read back.
collection.remove()
print("inserting with cpickle protocol 2")
# Each timed iteration generates a fresh random 50x3 array, pickles it with
# protocol=2 and inserts one document. The field name 'cpickle' is the same
# as in the default-protocol benchmark.
%timeit collection.insert({'cpickle': Binary(cPickle.dumps(np.random.rand(50,3), protocol=2))})
print("reading cpickle protocol 2")
# -n 100 fixes the number of loops per repeat at 100. cPickle.loads needs no
# protocol argument here; it is called exactly as in the default-protocol cell.
# NOTE(review): cPickle.loads can execute arbitrary code from a crafted
# pickle; this is only safe while the database contents are fully trusted.
%timeit -n 100 [cPickle.loads(x['cpickle']) for x in collection.find()]
inserting with cpickle protocol 2
1000 loops, best of 3: 208 us per loop
reading cpickle protocol 2
100 loops, best of 3: 97.3 ms per loop