Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patsy pickling #67

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions patsy/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
import ast
import numbers
import six
from six.moves import cPickle as pickle
from patsy import PatsyError
from patsy.util import PushbackAdapter, no_pickling, assert_no_pickling
from patsy.util import PushbackAdapter, no_pickling, assert_no_pickling, check_pickle_version
from patsy.tokens import (pretty_untokenize, normalize_token_spacing,
python_tokenize)
from patsy.compat import call_and_wrap_exc
Expand Down Expand Up @@ -565,7 +566,26 @@ def eval(self, memorize_state, data):
memorize_state,
data)

__getstate__ = no_pickling
def __getstate__(self):
return (0, self.code, self.origin)

def __setstate__(self, state):
(version, code, origin) = state
check_pickle_version(version, 0, self.__class__.__name__)
self.code = code
self.origin = origin

def test_EvalFactor_pickle_saves_origin():
# The pickling tests use object equality before and after pickling
# to test that pickling worked correctly. But EvalFactor's origin field
# is not used in equality comparisons, so we need a separate test to
# test that it is being pickled.
ORIGIN = 123456
f = EvalFactor('a', ORIGIN)
new_f = pickle.loads(pickle.dumps(f))

assert f.origin is not None
assert f.origin == new_f.origin

def test_EvalFactor_basics():
e = EvalFactor("a+b")
Expand All @@ -577,8 +597,6 @@ def test_EvalFactor_basics():
assert e.origin is None
assert e2.origin == "asdf"

assert_no_pickling(e)

def test_EvalFactor_memorize_passes_needed():
from patsy.state import stateful_transform
foo = stateful_transform(lambda: "FOO-OBJ")
Expand Down
24 changes: 24 additions & 0 deletions patsy/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Exhaustive end-to-end tests of the top-level API.

import sys
from six.moves import cPickle as pickle
import __future__
import six
import numpy as np
Expand Down Expand Up @@ -758,3 +759,26 @@ def test_C_and_pandas_categorical():
[[1, 0],
[1, 1],
[1, 0]])

def test_pickle_builder_roundtrips():
formulas = ["a + b",
"center(i) + a * b + np.log(x)"]
dataset = {"i": range(3),
"x": [1.1, 2.2, 3.3],
"a": list("abc"),
"b": list("xyx")}

for formula in formulas:
yield check_pickle_builder_roundtrips, formula, dataset

def check_pickle_builder_roundtrips(formula, dataset):
design_matrix = dmatrix(formula, dataset)
# TODO Make new_dataset have different values from dataset?
new_dataset = dataset

m1 = dmatrix(design_matrix.design_info, new_dataset)

unpickled_design_info = pickle.loads(pickle.dumps(design_matrix.design_info))
m2 = dmatrix(unpickled_design_info, new_dataset)

assert np.allclose(m1, m2)
76 changes: 76 additions & 0 deletions patsy/test_pickling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import print_function

import six
from six.moves import cPickle as pickle

import os
import shutil

from patsy import EvalFactor

PICKE_TESTCASES_ROOTDIR = os.path.join(os.path.dirname(__file__), '..', 'pickle_testcases')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/PICKE/PICKLE/, I assume?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, hmm, so it looks like you're storing the pickle tests outside the source tree. Right now we go to some effort to ship the test suite, so that we can test patsy-as-installed... and I think the current CI stuff tries hard to run the test suite without access to the source tree, to make sure that this works. So this is definitely a change. Maybe it's a good change if the pickle test suite is large... but we'll need to do some more work in that case to make it really work. OTOH, maybe the pickle test suite is small, in which case we might as well ship it, because testing is important.

Probably the simplest thing to do is to move it into a patsy/test_data directory inside the package for now, and then we can always move it back out again if we decide that we have too many megabytes of test pickles. (This will probably require some annoying change to setup.py as well to include the test data -- example.)


pickling_testcases = {
"evalfactor_simple": EvalFactor("a+b"),
}

def test_pickling_same_version_roundtrips():
for obj in six.itervalues(pickling_testcases):
yield (check_pickling_same_version_roundtrips, obj)

def check_pickling_same_version_roundtrips(obj):
assert obj == pickle.loads(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL))

def test_pickling_old_versions_still_work():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd call this and the next function unpickling_... instead of pickling_...

for (dirpath, dirnames, filenames) in os.walk(PICKE_TESTCASES_ROOTDIR):
for fname in filenames:
if os.path.splitext(fname)[1] == '.pickle':
yield check_pickling_old_versions_still_work, os.path.join(dirpath, fname)

def check_pickling_old_versions_still_work(pickle_filename):
testcase_name = os.path.splitext(os.path.basename(pickle_filename))[0]
with open(pickle_filename, 'rb') as f:
# When adding features to a class, it will happen that there is no
# way to make an instance of that version version of that class
# equal to any instance of a previous version. How do we handle
# that?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: version version

And I think the answer here is probably that when we unpickle an old object into a new patsy, if the new patsy has some new features, then we'll have to somehow initialize them for the old object (like fill in zeros or empty strings or whatever), because we need some way to represent it in memory. So we can put in a test case here that uses those default field values, and also add a new object that actually uses the new fields in a non-trivial way.

# Maybe adding a minimum version requirement to each test?
assert pickling_testcases[testcase_name] == pickle.load(f)

def test_unpickling_future_gives_sensible_error_msg():
# TODO How would we go about testing this?
pass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess just test that we get a PickleFromTheFuture exception, or whatever it is we decide to call that?


def create_pickles(version):
# TODO Add options to overwrite pickles directory, with force=True
# during development.
# TODO Add safety check that said force=True option will still give an
# error when trying to remove pickles for a released version, by
# comparing the version argument here with patsy.__version__.
pickle_testcases_dir = os.path.join(PICKE_TESTCASES_ROOTDIR, version)
if os.path.exists(pickle_testcases_dir):
raise OSError("{} already exists. Aborting.".format(pickle_testcases_dir))
pickle_testcases_tempdir = pickle_testcases_dir + "_inprogress"
os.mkdir(pickle_testcases_tempdir)

try:
for name, obj in six.iteritems(pickling_testcases):
with open(os.path.join(pickle_testcases_tempdir, "{}.pickle".format(name)), "wb") as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
except Exception:
print("Exception during creation of pickles for {}. " \
"Removing partially created directory.".format(version))
shutil.rmtree(pickle_testcases_tempdir)
raise
finally:
os.rename(pickle_testcases_tempdir, pickle_testcases_dir)
print("Successfully created pickle testcases for new version {}.".format(version))

if __name__ == "__main__":
import argparse

arg_parser = argparse.ArgumentParser(description="Create and save pickle testcases for a new version of patsy.")
arg_parser.add_argument("version", help="The version of patsy for which to save a new set of pickle testcases.")
args = arg_parser.parse_args()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might simplify the logic here if you know that our policy right now is that patsy version numbers are always of the form x.y.z+dev for development versions, and x.y.z only for released versions.


create_pickles(args.version)
22 changes: 21 additions & 1 deletion patsy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
]

import sys
from nose.tools import assert_raises
import numpy as np
import six
from six.moves import cStringIO as StringIO
Expand Down Expand Up @@ -698,7 +699,6 @@ def no_pickling(*args, **kwargs):

def assert_no_pickling(obj):
import pickle
from nose.tools import assert_raises
assert_raises(NotImplementedError, pickle.dumps, obj)

# Use like:
Expand All @@ -720,3 +720,23 @@ def test_safe_string_eq():
assert safe_string_eq(unicode("foo"), "foo")

assert not safe_string_eq(np.empty((2, 2)), "foo")

def check_pickle_version(version, required_version, name=""):
if version > required_version:
error_msg = "This version of patsy is too old to load this pickle"
elif version < required_version:
error_msg = "This pickle is too old and not supported by this version of patsy anymore"
else:
return
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Presumably we'll eventually want to support multiple pickle versions at the same time -- but I guess we can always refactor this when the time comes :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Specifically I think our goal will be to never stop supporting old pickles, so the "too old" error should never be needed... we'll see what happens though :-)


if name:
error_msg += " (for object {})".format(name)
error_msg += "."

# TODO Use a better exception than ValueError.
raise ValueError(error_msg)

def test_check_pickle_version():
assert_raises(ValueError, check_pickle_version, 0, 1)
assert_raises(ValueError, check_pickle_version, 1, 0)
check_pickle_version(0, 0)
Binary file added pickle_testcases/0.5/evalfactor_simple.pickle
Binary file not shown.
1 change: 1 addition & 0 deletions release-checklist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* verify that the ">97% coverage" claim in overview.rst is still true.
* cd docs; make clean html -- check that there are no warnings
* check MANIFEST.in
* run python -m patsy.test_pickling NEW_VERSION_NUMBER
* update version in doc/changes.rst, patsy/version.py
* make sure there are no uncommitted changes
* clone a clean source directory (so as to get a clean checkout
Expand Down