Skip to content

Commit

Permalink
Merge pull request #75 from UW-Hydro/develop
Browse files Browse the repository at this point in the history
Merge for initial release
  • Loading branch information
arbennett committed Jul 2, 2021
2 parents 5226ca7 + ec09f32 commit 6c3f393
Show file tree
Hide file tree
Showing 14 changed files with 1,150 additions and 1,328 deletions.
148 changes: 78 additions & 70 deletions bmorph/core/bmorph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
bmorph: modify a time series by removing elements of persistent differences
(aka bias correction)
Persistent differences are inferred by comparing a 'truth' sample with a
Persistent differences are inferred by comparing a 'ref' sample with a
'training' sample. These differences are then used to correct a 'raw' sample
that is presumed to have the same persistent differences as the 'training'
sample. The resulting 'bmorph' sample should then be consistent with the
'truth' sample.
'ref' sample.
"""

import numpy as np
Expand All @@ -25,7 +25,7 @@ def kde2D(x, y, xbins=200, ybins=10, **kwargs):
xy_train = np.vstack([y, x]).T
kde = KernelDensity(**kwargs)
kde.fit(xy_train)
# should this be exponential'd?
# Need to exponentiate to get true probabilities
z = np.exp(kde.score_samples(xy_sample))
return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)

Expand All @@ -49,46 +49,55 @@ def marginalize_cdf(y_raw, z_raw, vals):
return z


def cqm(raw_x: pd.Series, train_x: pd.Series, truth_x: pd.Series,
raw_y: pd.Series, train_y: pd.Series, truth_y: pd.Series=None,
method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0) -> pd.Series:
def cqm(raw_x: pd.Series, train_x: pd.Series, ref_x: pd.Series,
raw_y: pd.Series, train_y: pd.Series, ref_y: pd.Series=None,
method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0, nsmooth=5) -> pd.Series:
"""Conditional Quantile Mapping
Multidimensional conditional equidistant CDF matching function:
\tilde{x_{mp}} = x_{mp} + F^{-1}_{oc}(F_{mp}(x_{mp}|y_{mp})|y_{oc})
- F^{-1}_{mc}(F_{mp}(x_{mp}|y_{mp})|y_{mc})
"""

if method == 'kde':
x_raw, y_raw, z_raw = kde2D(raw_x, raw_y, xbins, ybins,
bandwidth=bw, rtol=rtol, atol=atol)
x_train, y_train, z_train = kde2D(train_x, train_y, xbins, ybins,
bandwidth=bw, rtol=rtol, atol=atol)
x_truth, y_truth, z_truth = kde2D(truth_x, truth_y, xbins, ybins,
x_ref, y_ref, z_ref = kde2D(ref_x, ref_y, xbins, ybins,
bandwidth=bw, rtol=rtol, atol=atol)
elif method == 'hist':
x_raw, y_raw, z_raw = hist2D(raw_x, raw_y, xbins, ybins)
x_train, y_train, z_train = hist2D(train_x, train_y, xbins, ybins)
x_truth, y_truth, z_truth = hist2D(truth_x, truth_y, xbins, ybins)
x_ref, y_ref, z_ref = hist2D(ref_x, ref_y, xbins, ybins)
else:
raise Exception("Current methods for cqm only include 'hist' to use hist2D "
"and 'kde' to use kde2D, please select one.")

nx = np.arange(len(raw_x))
# Lookup tables of actual cdfs with dimensions (xbins, time)
raw_cdfs = marginalize_cdf(y_raw, z_raw, raw_y)
u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1), nx]
# Lookup tables of actual cdfs with dimensions (time, xbins)
# These are transposed to do the inverse lookup
# Note we always marginalize with respect to the raw values
train_cdf = marginalize_cdf(y_train, z_train, raw_y).T
ref_cdf = marginalize_cdf(y_ref, z_ref, raw_y).T

# Quantiles of the raw flows
nx_raw = np.arange(len(raw_x))
u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1), nx_raw]
u_t = u_t[:, np.newaxis]

train_cdf = marginalize_cdf(y_train, z_train, train_y).T
mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx, :]), axis=1)]

truth_cdfs = marginalize_cdf(y_truth, z_truth, truth_y).T
mapped_truth = x_truth[np.argmin(np.abs(u_t - truth_cdfs[nx, :]), axis=1)]

return pd.Series(mapped_truth / mapped_train, index=raw_x.index)
# Lookup the associated flow values in the CDFs for the reference and training periods
mapped_ref = x_ref[np.argmin(np.abs(u_t - ref_cdf[nx_raw, :]), axis=1)]
mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx_raw, :]), axis=1)]
mapped_train[mapped_train < 1e-6] = 1e-6
multipliers = pd.Series(mapped_ref / mapped_train, index=raw_x.index, name='multiplier')
if method == 'hist':
# Do some smoothing just to reduce effects of histogram bin edges
multipliers = multipliers.rolling(nsmooth, win_type='triang', min_periods=1).mean()
return multipliers


def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
def edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf):
'''Calculate multipliers using an adapted version of the EDCDFm technique
This routine implements part of the PresRat bias correction method from
Expand All @@ -99,12 +108,12 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
multiplicative changes in the quantiles of a CDF.
In particular, if the value `raw_x` falls at quantile `u_t` (in `raw_cdf`),
then the bias-corrected value is the value in `truth_cdf` at `u_t`
(`truth_x`) multiplied by the model-predicted change at `u_t` evaluated as
then the bias-corrected value is the value in `ref_cdf` at `u_t`
(`ref_x`) multiplied by the model-predicted change at `u_t` evaluated as
a ratio (i.e., model future (or `raw_x`) / model historical (or
`truth_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
`truth_x`/`train_x`. Here we only return the multiplier
`truth_x`/`train_x`. This method preserves the model-predicted median
`ref_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
`ref_x`/`train_x`. Here we only return the multiplier
`ref_x`/`train_x`. This method preserves the model-predicted median
(not mean) change evaluated multiplicatively. Additional corrections
are required to preserve the mean change. Inclusion of these additional
corrections constitutes the PresRat method.
Expand All @@ -118,9 +127,9 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
determine the non-parametric quantile of `raw_x`
train_cdf: pandas.Series
Series of training values that represents the CDF based on
the same process as `raw_cdf`, but overlapping in time with `truth_cdf`
truth_cdf: pandas.Series
Series of truth values that represents the truth CDF and that
the same process as `raw_cdf`, but overlapping in time with `ref_cdf`
ref_cdf: pandas.Series
Series of ref values that represents the ref CDF and that
overlaps in time with `train_cdf`
Returns
Expand All @@ -134,7 +143,7 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
assert isinstance(raw_x, pd.Series)
assert isinstance(raw_cdf, pd.Series)
assert isinstance(train_cdf, pd.Series)
assert isinstance(truth_cdf, pd.Series)
assert isinstance(ref_cdf, pd.Series)

# Given raw_x and raw_cdf determine the quantiles u_t
# This method is slightly more efficient than using
Expand All @@ -149,21 +158,19 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
train_x = np.percentile(train_cdf, u_t)
train_x[train_x < 1e-6] = 1e-6

# Given u_t and truth_cdf determine truth_x
truth_x = np.percentile(truth_cdf, u_t)
# Given u_t and ref_cdf determine ref_x
ref_x = np.percentile(ref_cdf, u_t)

# Calculate multiplier
multiplier = truth_x / train_x
multiplier = ref_x / train_x
return pd.Series(multiplier, index=raw_x.index, name='multiplier')

return pd.Series(multiplier, index=raw_x.index)


def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
truth_ts, train_ts, training_window,
nsmooth, raw_y=None, truth_y=None, train_y=None,
bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0,
method='hist'):
'''Morph `raw_ts` based on differences between `truth_ts` and `train_ts`
def bmorph(raw_ts, train_ts, ref_ts,
raw_apply_window, raw_train_window, ref_train_window, raw_cdf_window,
raw_y=None, ref_y=None, train_y=None,
nsmooth=12, bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0, method='hist'):
'''Morph `raw_ts` based on differences between `ref_ts` and `train_ts`
bmorph is an adaptation of the PresRat bias correction procedure from
Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
Expand All @@ -189,21 +196,21 @@ def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
Slice used to determine the CDF for `raw_ts`
raw_bmorph_window : slice
Slice of `raw_ts` that will be bmorphed
truth_ts : pandas.Series
Target time series. This is the time series with truth values that
overlaps with `train_ts` and is used to calculated `truth_cdf`
ref_ts : pandas.Series
Target time series. This is the time series with ref values that
overlaps with `train_ts` and is used to calculated `ref_cdf`
train_ts : pandas.Series
Training time series. This time series is generated by the same process
as `raw_ts` but overlaps with `truth_ts`. It is used to calculate
as `raw_ts` but overlaps with `ref_ts`. It is used to calculate
`train_cdf`
training_window : slice
Slice used to subset `truth_ts` and `train_ts` when the mapping between
Slice used to subset `ref_ts` and `train_ts` when the mapping between
them is created
nsmooth : int
Number of elements that will be smoothed when determining CDFs
raw_y : pandas.Series
Raw time series of the second time series variable for cqm
truth_y : pandas.Series
ref_y : pandas.Series
Target second time series
train_y : pandas.Series
Training second time series
Expand All @@ -222,60 +229,61 @@ def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
'''
# Type checking
assert isinstance(raw_ts, pd.Series)
assert isinstance(raw_cdf_window, slice)
assert isinstance(raw_bmorph_window, slice)
assert isinstance(truth_ts, pd.Series)
assert isinstance(ref_ts, pd.Series)
assert isinstance(train_ts, pd.Series)
assert isinstance(training_window, slice)

assert isinstance(raw_apply_window, slice)
assert isinstance(raw_train_window, slice)
assert isinstance(ref_train_window, slice)
assert isinstance(raw_cdf_window, slice)

# Smooth the raw Series
raw_smoothed_ts = raw_ts.rolling(window=nsmooth, min_periods=1, center=True).mean()

# Create the CDFs that are used for morphing the raw_ts. The mapping is
# based on the training_window
truth_cdf = truth_ts[training_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
train_cdf = train_ts[training_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
ref_cdf = ref_ts[ref_train_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
train_cdf = train_ts[raw_train_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
raw_cdf = raw_smoothed_ts[raw_cdf_window]

# Check if using edcdfm or cqm through second variable being added
# for the raw and train series because truth can be set as train later
if (raw_y is None) or (truth_y is None) or (train_y is None):
# for the raw and train series because ref can be set as train later
if (raw_y is None) or (ref_y is None) or (train_y is None):
# Calculate the bmorph multipliers based on the smoothed time series and
# PDFs
bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_bmorph_window],
raw_smoothed_ts[raw_cdf_window],
train_cdf, truth_cdf)
bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_apply_window],
raw_cdf, train_cdf, ref_cdf)

# Apply the bmorph multipliers to the raw time series
bmorph_ts = bmorph_multipliers * raw_ts[raw_bmorph_window]
bmorph_ts = bmorph_multipliers * raw_ts[raw_apply_window]

else:
# Continue Type Checking additionally series
assert isinstance(raw_y, pd.Series)
assert isinstance(truth_y, pd.Series)
assert isinstance(ref_y, pd.Series)
assert isinstance(train_y, pd.Series)

# smooth the y series as well
raw_smoothed_y = raw_y.rolling(
window=nsmooth, min_periods=1, center=True).mean()

truth_smoothed_y = truth_y[training_window].rolling(
ref_smoothed_y = ref_y[ref_train_window].rolling(
window=nsmooth, min_periods=1, center=True).mean()
train_smoothed_y = train_y[training_window].rolling(
train_smoothed_y = train_y[raw_train_window].rolling(
window=nsmooth, min_periods=1, center=True).mean()

bmorph_multipliers = cqm(raw_smoothed_ts[raw_bmorph_window],
train_cdf, truth_cdf,
raw_smoothed_y[raw_bmorph_window],
train_smoothed_y, truth_smoothed_y,
bmorph_multipliers = cqm(raw_cdf, train_cdf, ref_cdf,
raw_smoothed_y[raw_cdf_window],
train_smoothed_y, ref_smoothed_y,
bw=bw, xbins=xbins, ybins=ybins,
rtol=rtol, atol=atol, method=method)

bmorph_ts = bmorph_multipliers * raw_ts[raw_bmorph_window]
bmorph_ts = bmorph_multipliers[raw_apply_window] * raw_ts[raw_apply_window]

return bmorph_ts, bmorph_multipliers


def bmorph_correct(raw_ts, bmorph_ts, correction_window,
truth_mean, train_mean, nsmooth):
ref_mean, train_mean, nsmooth):
'''Correct bmorphed values to preserve the ratio of change
Apply a correction to bmorphed values to preserve the mean change over a
Expand All @@ -293,8 +301,8 @@ def bmorph_correct(raw_ts, bmorph_ts, correction_window,
Series of bmorphed values
correction_window : slice
Slice of `raw_ts` and `bmorph_ts` over which the correction is applied
truth_mean : float
Mean of target time series (`truth_ts`) for the base period
ref_mean : float
Mean of target time series (`ref_ts`) for the base period
train_mean : float
Mean of training time series (`train_ts`) for the base period
nsmooth : int
Expand Down Expand Up @@ -322,7 +330,7 @@ def bmorph_correct(raw_ts, bmorph_ts, correction_window,
# Calculate the correction factors
correction_ts = raw_ts_smoothed[correction_window] / \
bmorph_ts_smoothed[correction_window] * \
truth_mean/train_mean
ref_mean/train_mean

# Apply the correction to the bmorph time series
bmorph_corrected_ts = correction_ts * bmorph_ts[correction_window]
Expand Down
Loading

0 comments on commit 6c3f393

Please sign in to comment.