Mailing List Archive

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Pull make_cv_objective outside tuner
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406067 )

Change subject: Pull make_cv_objective outside tuner
......................................................................

Pull make_cv_objective outside tuner

This really had no business in tuner; its function is
independent and it didn't require any of the state. Adds
a test that verifies the function works roughly as expected.

Also drop the 'condition' argument from tuner stages. A standard
if condition should be used when building the stage list.

Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a
---
M mjolnir/test/training/test_tuning.py
M mjolnir/training/tuning.py
M mjolnir/training/xgboost.py
M mjolnir/utils.py
4 files changed, 72 insertions(+), 63 deletions(-)


git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/67/406067/1

diff --git a/mjolnir/test/training/test_tuning.py b/mjolnir/test/training/test_tuning.py
index 22402f1..15389d7 100644
--- a/mjolnir/test/training/test_tuning.py
+++ b/mjolnir/test/training/test_tuning.py
@@ -46,7 +46,7 @@
}

tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages)
- train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, **kwargs)
+ train_func = mjolnir.training.tuning.make_cv_objective(f, folds, num_cv_jobs, **kwargs)
trials_pool = tuner.build_pool(folds, num_cv_jobs)
result = tuner(train_func, trials_pool)
return result, stats['called']
@@ -80,39 +80,14 @@
assert result['params']['baz'] == 0


-def test_ModelSelection_stage_condition():
- num_iterations = 3
- result, called = run_model_selection([.
- ('a', {
- 'condition': lambda: False,
- 'iterations': num_iterations,
- 'space': {
- 'foo': hyperopt.hp.uniform('foo', 1, 9),
- }
- }),
- ('b', {
- 'iterations': num_iterations,
- 'space': {
- 'bar': hyperopt.hp.uniform('bar', 1, 9),
- }
- }),
- ])
- # iterations * folds
- assert called == num_iterations * 2
- assert result['params']['foo'] == 10
- assert 1 <= result['params']['bar'] <= 9
- assert result['params']['baz'] == 0
-
-
def test_ModelSelection_kwargs_pass_thru():
- tuner = mjolnir.training.tuning.ModelSelection(None, None)
expected_kwargs = {'hi': 5, 'there': 'test'}

def f(fold, params, **kwargs):
assert kwargs == expected_kwargs
return {'test': [fold[0]], 'train': [fold[0]]}

- obj = tuner.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs)
+ obj = mjolnir.training.tuning.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs)

res = obj(None)
assert res == [
@@ -144,3 +119,23 @@
folds = [[1] * num_workers for i in range(num_folds)]
pool = tuner.build_pool(folds, num_cv_jobs)
assert (pool is not None) == expect_pool
+
+
+def test_ModelSelection_transformer():
+ stats = {'called': 0}
+
+ def transformer(result, params):
+ assert 'foo' in result
+ assert result['foo'] == 'bar'
+ assert params == 'some params'
+ stats['called'] += 1
+ return 'baz'
+
+ def f(fold, params):
+ assert params == 'some params'
+ return {'foo': 'bar'}
+
+ folds = [[1, 2, 3], [4, 5, 6]]
+ obj = mjolnir.training.tuning.make_cv_objective(f, folds, 1, transformer)
+ assert obj('some params') == ['baz', 'baz']
+ assert stats['called'] == 2
diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py
index 7d2df68..81bfafe 100644
--- a/mjolnir/training/tuning.py
+++ b/mjolnir/training/tuning.py
@@ -133,11 +133,48 @@
return with_retry


+def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, **kwargs):
+ """Create a cross-validation objective function
+
+ Parameters
+ ----------
+ train_func : callable
+ Function accepting a fold and hyperparameters to perform training
+ num_cv_jobs : int
+ The total number of folds to train in parallel
+ transformer : callable or None, optional
+ Function accepting output of train_func and hyperparameters to
+ return stats about the individual fold train/test performance
+
+ Returns
+ -------
+ callable
+ Accepts a set of hyperparameters as only argument and returns
+ list of per-fold train/test performance.
+ """
+ train_func = _py4j_retry(train_func, None)
+ if num_cv_jobs > 1:
+ cv_pool = Pool(num_cv_jobs)
+ cv_mapper = cv_pool.map
+ else:
+ cv_mapper = map
+
+ def f(params):
+ def inner(fold):
+ return train_func(fold, params, **kwargs)
+
+ return cv_mapper(inner, folds)
+
+ if transformer is None:
+ return f
+ else:
+ return lambda params: [transformer(scores, params) for scores in f(params)]
+
+
class ModelSelection(object):
- def __init__(self, initial_space, tune_stages, transformer=None):
+ def __init__(self, initial_space, tune_stages):
self.initial_space = initial_space
self.tune_stages = tune_stages
- self.transformer = transformer

def build_pool(self, folds, num_cv_jobs):
num_folds = len(folds)
@@ -148,31 +185,7 @@
else:
return None

- def make_cv_objective(self, train_func, folds, num_cv_jobs, **kwargs):
- train_func = _py4j_retry(train_func, None)
- if num_cv_jobs > 1:
- cv_pool = Pool(num_cv_jobs)
- cv_mapper = cv_pool.map
- else:
- cv_mapper = map
-
- def f(params):
- def inner(fold):
- return train_func(fold, params, **kwargs)
-
- return cv_mapper(inner, folds)
-
- if not self.transformer:
- return f
-
- def g(params):
- return [self.transformer(scores, params) for scores in f(params)]
-
- return g
-
def eval_stage(self, train_func, stage, space, pool):
- if 'condition' in stage and not stage['condition']():
- return space, None
# Override current space with new space
merged = dict(space, **stage['space'])
best, trials = mjolnir.training.hyperopt.maximize(
@@ -190,8 +203,7 @@
stages = []
for stage_name, stage in self.tune_stages:
space, trials = self.eval_stage(train_func, stage, space, pool)
- if trials is not None:
- stages.append((stage_name, trials))
+ stages.append((stage_name, trials))

trials_final = stages[-1][1]
best_trial = np.argmin(trials_final.losses())
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index 1a23a66..abeaabf 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -2,7 +2,7 @@
import hyperopt
import mjolnir.spark
import mjolnir.training.hyperopt
-from mjolnir.training.tuning import ModelSelection
+from mjolnir.training.tuning import make_cv_objective, ModelSelection
import numpy as np
import pyspark
import pyspark.sql
@@ -410,16 +410,17 @@
'colsample_bytree': hyperopt.hp.quniform('colsample_bytree', 0.8, 1, .01),
}
}[dataset_size]
- }),
- ('trees', {
+ })
+ ]
+
+ if final_num_trees is not None and final_num_trees != initial_num_trees:
+ tune_spaces.append(('trees', {
'iterations': 30,
- 'condition': lambda: final_num_trees is not None and final_num_trees != initial_num_trees,
'space': {
'num_rounds': final_num_trees,
'eta': hyperopt.hp.uniform('eta', 0.1, 0.4),
}
- })
- ]
+ }))

# Baseline parameters to start with. Roughly tuned by what has worked in
# the past. These vary though depending on number of training samples. These
@@ -441,7 +442,7 @@
'colsample_bytree': 0.8,
}

- tuner = ModelSelection(space, tune_spaces, cv_transformer)
- train_func = tuner.make_cv_objective(train, folds, num_cv_jobs, train_matrix=train_matrix)
+ tuner = ModelSelection(space, tune_spaces)
+ train_func = make_cv_objective(train, folds, num_cv_jobs, cv_transformer, train_matrix=train_matrix)
trials_pool = tuner.build_pool(folds, num_cv_jobs)
return tuner(train_func, trials_pool)
diff --git a/mjolnir/utils.py b/mjolnir/utils.py
index 7ac6422..f64f2bd 100644
--- a/mjolnir/utils.py
+++ b/mjolnir/utils.py
@@ -63,6 +63,7 @@
else:
# TODO: Untested
with tempfile.NamedTemporaryFile() as local:
+ os.unlink(local.name)
subprocess.check_call(['hdfs', 'dfs', '-copyToLocal', path, local.name])
if with_query:
try:

--
To view, visit https://gerrit.wikimedia.org/r/406067
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhardson@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits