Merge pull request #75 from UW-Hydro/develop

Merge for initial release
UW-Hydro · Jul 2, 2021 · 6c3f393 · 6c3f393
2 parents 5226ca7 + ec09f32
commit 6c3f393
Show file tree

Hide file tree

Showing 14 changed files with 1,150 additions and 1,328 deletions.
diff --git a/bmorph/core/bmorph.py b/bmorph/core/bmorph.py
@@ -2,11 +2,11 @@
 bmorph: modify a time series by removing elements of persistent differences
         (aka bias correction)
 
-Persistent differences are inferred by comparing a 'truth' sample with a
+Persistent differences are inferred by comparing a 'ref' sample with a
 'training' sample. These differences are then used to correct a 'raw' sample
 that is presumed to have the same persistent differences as the 'training'
 sample. The resulting 'bmorph' sample should then be consistent with the
-'truth' sample.
+'ref' sample.
 """
 
 import numpy as np
@@ -25,7 +25,7 @@ def kde2D(x, y, xbins=200, ybins=10, **kwargs):
     xy_train  = np.vstack([y, x]).T
     kde = KernelDensity(**kwargs)
     kde.fit(xy_train)
-    # should this be exponential'd?
+    # Need to exponentiate to get true probabilities
     z = np.exp(kde.score_samples(xy_sample))
     return xx[:, 0], yy[0, :], np.reshape(z, yy.shape)
 
@@ -49,46 +49,55 @@ def marginalize_cdf(y_raw, z_raw, vals):
     return z
 
 
-def cqm(raw_x: pd.Series, train_x: pd.Series, truth_x: pd.Series,
-        raw_y: pd.Series, train_y: pd.Series, truth_y: pd.Series=None,
-        method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0) -> pd.Series:
+def cqm(raw_x: pd.Series, train_x: pd.Series, ref_x: pd.Series,
+        raw_y: pd.Series, train_y: pd.Series, ref_y: pd.Series=None,
+        method='hist', xbins=200, ybins=10, bw=3, rtol=1e-7, atol=0, nsmooth=5) -> pd.Series:
     """Conditional Quantile Mapping
 
     Multidimensional conditional equidistant CDF matching function:
     \tilde{x_{mp}} = x_{mp} + F^{-1}_{oc}(F_{mp}(x_{mp}|y_{mp})|y_{oc})
                             - F^{-1}_{mc}(F_{mp}(x_{mp}|y_{mp})|y_{mc})
     """
-
     if method == 'kde':
         x_raw, y_raw, z_raw = kde2D(raw_x, raw_y, xbins, ybins,
                                     bandwidth=bw, rtol=rtol, atol=atol)
         x_train, y_train, z_train = kde2D(train_x, train_y, xbins, ybins,
                                           bandwidth=bw, rtol=rtol, atol=atol)
-        x_truth, y_truth, z_truth = kde2D(truth_x, truth_y, xbins, ybins,
+        x_ref, y_ref, z_ref = kde2D(ref_x, ref_y, xbins, ybins,
                                           bandwidth=bw, rtol=rtol, atol=atol)
     elif method == 'hist':
         x_raw, y_raw, z_raw = hist2D(raw_x, raw_y, xbins, ybins)
         x_train, y_train, z_train = hist2D(train_x, train_y, xbins, ybins)
-        x_truth, y_truth, z_truth = hist2D(truth_x, truth_y, xbins, ybins)
+        x_ref, y_ref, z_ref = hist2D(ref_x, ref_y, xbins, ybins)
     else:
         raise Exception("Current methods for cqm only include 'hist' to use hist2D "
                         "and 'kde' to use kde2D, please select one.")
 
-    nx = np.arange(len(raw_x))
+    # Lookup tables of actual cdfs with dimensions (xbins, time)
     raw_cdfs = marginalize_cdf(y_raw, z_raw, raw_y)
-    u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1), nx]
+    # Lookup tables of actual cdfs with dimensions (time, xbins)
+    # These are transposed to do the inverse lookup
+    # Note we always marginalize with respect to the raw values
+    train_cdf = marginalize_cdf(y_train, z_train, raw_y).T
+    ref_cdf = marginalize_cdf(y_ref, z_ref, raw_y).T
+
+    # Quantiles of the raw flows
+    nx_raw = np.arange(len(raw_x))
+    u_t = raw_cdfs[np.argmin(np.abs(raw_x.values[:, np.newaxis] - x_raw), axis=1), nx_raw]
     u_t = u_t[:, np.newaxis]
 
-    train_cdf = marginalize_cdf(y_train, z_train, train_y).T
-    mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx, :]), axis=1)]
-
-    truth_cdfs = marginalize_cdf(y_truth, z_truth, truth_y).T
-    mapped_truth = x_truth[np.argmin(np.abs(u_t - truth_cdfs[nx, :]), axis=1)]
-
-    return pd.Series(mapped_truth / mapped_train, index=raw_x.index)
+    # Lookup the associated flow values in the CDFs for the reference and training periods
+    mapped_ref = x_ref[np.argmin(np.abs(u_t - ref_cdf[nx_raw, :]), axis=1)]
+    mapped_train = x_train[np.argmin(np.abs(u_t - train_cdf[nx_raw, :]), axis=1)]
+    mapped_train[mapped_train < 1e-6] = 1e-6
+    multipliers = pd.Series(mapped_ref / mapped_train, index=raw_x.index, name='multiplier')
+    if method == 'hist':
+        # Do some smoothing just to reduce effects of histogram bin edges
+        multipliers = multipliers.rolling(nsmooth, win_type='triang', min_periods=1).mean()
+    return multipliers
 
 
-def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
+def edcdfm(raw_x, raw_cdf, train_cdf, ref_cdf):
     '''Calculate  multipliers using an adapted version of the EDCDFm technique
 
     This routine implements part of the PresRat bias correction method from
@@ -99,12 +108,12 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
     multiplicative changes in the quantiles of a CDF.
 
     In particular, if the value `raw_x` falls at quantile `u_t` (in `raw_cdf`),
-    then the bias-corrected value is the value in `truth_cdf` at `u_t`
-    (`truth_x`) multiplied by the model-predicted change at `u_t` evaluated as
+    then the bias-corrected value is the value in `ref_cdf` at `u_t`
+    (`ref_x`) multiplied by the model-predicted change at `u_t` evaluated as
     a ratio (i.e., model future (or `raw_x`) / model historical (or
-    `truth_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
-    `truth_x`/`train_x`. Here we only return the multiplier
-    `truth_x`/`train_x`. This method preserves the model-predicted median
+    `ref_x`)). Thus, the bias-corrected value is `raw_x` multiplied by
+    `ref_x`/`train_x`. Here we only return the multiplier
+    `ref_x`/`train_x`. This method preserves the model-predicted median
     (not mean) change evaluated multiplicatively. Additional corrections
     are required to preserve the mean change. Inclusion of these additional
     corrections constitutes the PresRat method.
@@ -118,9 +127,9 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
         determine the non-parametric quantile of `raw_x`
     train_cdf: pandas.Series
         Series of training values that represents the CDF based on
-        the same process as `raw_cdf`, but overlapping in time with `truth_cdf`
-    truth_cdf: pandas.Series
-         Series of truth values that represents the truth CDF and that
+        the same process as `raw_cdf`, but overlapping in time with `ref_cdf`
+    ref_cdf: pandas.Series
+         Series of ref values that represents the ref CDF and that
          overlaps in time with `train_cdf`
 
     Returns
@@ -134,7 +143,7 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
     assert isinstance(raw_x, pd.Series)
     assert isinstance(raw_cdf, pd.Series)
     assert isinstance(train_cdf, pd.Series)
-    assert isinstance(truth_cdf, pd.Series)
+    assert isinstance(ref_cdf, pd.Series)
 
     # Given raw_x and raw_cdf determine the quantiles u_t
     # This method is slightly more efficient than using
@@ -149,21 +158,19 @@ def edcdfm(raw_x, raw_cdf, train_cdf, truth_cdf):
     train_x = np.percentile(train_cdf, u_t)
     train_x[train_x < 1e-6] = 1e-6
 
-    # Given u_t and truth_cdf determine truth_x
-    truth_x = np.percentile(truth_cdf, u_t)
+    # Given u_t and ref_cdf determine ref_x
+    ref_x = np.percentile(ref_cdf, u_t)
 
     # Calculate multiplier
-    multiplier = truth_x / train_x
+    multiplier = ref_x / train_x
+    return pd.Series(multiplier, index=raw_x.index, name='multiplier')
 
-    return pd.Series(multiplier, index=raw_x.index)
 
-
-def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
-           truth_ts, train_ts, training_window,
-           nsmooth, raw_y=None, truth_y=None, train_y=None,
-           bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0,
-           method='hist'):
-    '''Morph `raw_ts` based on differences between `truth_ts` and `train_ts`
+def bmorph(raw_ts, train_ts, ref_ts,
+           raw_apply_window, raw_train_window, ref_train_window, raw_cdf_window,
+           raw_y=None, ref_y=None, train_y=None,
+           nsmooth=12, bw=3, xbins=200, ybins=10, rtol=1e-7, atol=0, method='hist'):
+    '''Morph `raw_ts` based on differences between `ref_ts` and `train_ts`
 
     bmorph is an adaptation of the PresRat bias correction procedure from
     Pierce et al. (2015; http://dx.doi.org/10.1175/JHM-D-14-0236.1), which is
@@ -189,21 +196,21 @@ def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
         Slice used to determine the CDF for `raw_ts`
     raw_bmorph_window : slice
         Slice of `raw_ts` that will be bmorphed
-    truth_ts : pandas.Series
-        Target time series. This is the time series with truth values that
-        overlaps with `train_ts` and is used to calculated `truth_cdf`
+    ref_ts : pandas.Series
+        Target time series. This is the time series with ref values that
+        overlaps with `train_ts` and is used to calculated `ref_cdf`
     train_ts : pandas.Series
         Training time series. This time series is generated by the same process
-        as `raw_ts` but overlaps with `truth_ts`. It is used to calculate
+        as `raw_ts` but overlaps with `ref_ts`. It is used to calculate
         `train_cdf`
     training_window : slice
-        Slice used to subset `truth_ts` and `train_ts` when the mapping between
+        Slice used to subset `ref_ts` and `train_ts` when the mapping between
         them is created
     nsmooth : int
         Number of elements that will be smoothed when determining CDFs
     raw_y : pandas.Series
         Raw time series of the second time series variable for cqm
-    truth_y : pandas.Series
+    ref_y : pandas.Series
         Target second time series
     train_y : pandas.Series
         Training second time series
@@ -222,60 +229,61 @@ def bmorph(raw_ts, raw_cdf_window, raw_bmorph_window,
     '''
     # Type checking
     assert isinstance(raw_ts, pd.Series)
-    assert isinstance(raw_cdf_window, slice)
-    assert isinstance(raw_bmorph_window, slice)
-    assert isinstance(truth_ts, pd.Series)
+    assert isinstance(ref_ts, pd.Series)
     assert isinstance(train_ts, pd.Series)
-    assert isinstance(training_window, slice)
+
+    assert isinstance(raw_apply_window, slice)
+    assert isinstance(raw_train_window, slice)
+    assert isinstance(ref_train_window, slice)
+    assert isinstance(raw_cdf_window, slice)
 
     # Smooth the raw Series
     raw_smoothed_ts = raw_ts.rolling(window=nsmooth, min_periods=1, center=True).mean()
+
     # Create the CDFs that are used for morphing the raw_ts. The mapping is
     # based on the training_window
-    truth_cdf = truth_ts[training_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
-    train_cdf = train_ts[training_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
+    ref_cdf = ref_ts[ref_train_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
+    train_cdf = train_ts[raw_train_window].rolling(window=nsmooth, min_periods=1, center=True).mean()
+    raw_cdf = raw_smoothed_ts[raw_cdf_window]
 
     # Check if using edcdfm or cqm through second variable being added
-    # for the raw and train series because truth can be set as train later
-    if (raw_y is None) or (truth_y is None) or (train_y is None):
+    # for the raw and train series because ref can be set as train later
+    if (raw_y is None) or (ref_y is None) or (train_y is None):
         # Calculate the bmorph multipliers based on the smoothed time series and
         # PDFs
-        bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_bmorph_window],
-                                    raw_smoothed_ts[raw_cdf_window],
-                                    train_cdf, truth_cdf)
+        bmorph_multipliers = edcdfm(raw_smoothed_ts[raw_apply_window],
+                                    raw_cdf, train_cdf, ref_cdf)
 
         # Apply the bmorph multipliers to the raw time series
-        bmorph_ts = bmorph_multipliers * raw_ts[raw_bmorph_window]
+        bmorph_ts = bmorph_multipliers * raw_ts[raw_apply_window]
 
     else:
         # Continue Type Checking additionally series
         assert isinstance(raw_y, pd.Series)
-        assert isinstance(truth_y, pd.Series)
+        assert isinstance(ref_y, pd.Series)
         assert isinstance(train_y, pd.Series)
 
         # smooth the y series as well
         raw_smoothed_y = raw_y.rolling(
             window=nsmooth, min_periods=1, center=True).mean()
 
-        truth_smoothed_y = truth_y[training_window].rolling(
+        ref_smoothed_y = ref_y[ref_train_window].rolling(
             window=nsmooth, min_periods=1, center=True).mean()
-        train_smoothed_y = train_y[training_window].rolling(
+        train_smoothed_y = train_y[raw_train_window].rolling(
             window=nsmooth, min_periods=1, center=True).mean()
 
-        bmorph_multipliers = cqm(raw_smoothed_ts[raw_bmorph_window],
-                                 train_cdf, truth_cdf,
-                                 raw_smoothed_y[raw_bmorph_window],
-                                 train_smoothed_y, truth_smoothed_y,
+        bmorph_multipliers = cqm(raw_cdf, train_cdf, ref_cdf,
+                                 raw_smoothed_y[raw_cdf_window],
+                                 train_smoothed_y, ref_smoothed_y,
                                  bw=bw, xbins=xbins, ybins=ybins,
                                  rtol=rtol, atol=atol, method=method)
-
-        bmorph_ts = bmorph_multipliers * raw_ts[raw_bmorph_window]
+        bmorph_ts = bmorph_multipliers[raw_apply_window] * raw_ts[raw_apply_window]
 
     return bmorph_ts, bmorph_multipliers
 
 
 def bmorph_correct(raw_ts, bmorph_ts, correction_window,
-                   truth_mean, train_mean, nsmooth):
+                   ref_mean, train_mean, nsmooth):
     '''Correct bmorphed values to preserve the ratio of change
 
     Apply a correction to bmorphed values to preserve the mean change over a
@@ -293,8 +301,8 @@ def bmorph_correct(raw_ts, bmorph_ts, correction_window,
         Series of bmorphed values
     correction_window : slice
         Slice of `raw_ts` and `bmorph_ts` over which the correction is applied
-    truth_mean : float
-        Mean of target time series (`truth_ts`) for the base period
+    ref_mean : float
+        Mean of target time series (`ref_ts`) for the base period
     train_mean : float
         Mean of training time series (`train_ts`) for the base period
     nsmooth : int
@@ -322,7 +330,7 @@ def bmorph_correct(raw_ts, bmorph_ts, correction_window,
     # Calculate the correction factors
     correction_ts = raw_ts_smoothed[correction_window] / \
         bmorph_ts_smoothed[correction_window] * \
-        truth_mean/train_mean
+        ref_mean/train_mean
 
     # Apply the correction to the bmorph time series
     bmorph_corrected_ts = correction_ts * bmorph_ts[correction_window]