Source code for pyamr.core.regression.wls

##############################################################################
# Author: Bernard Hernandez
# Filename: wls.py
#
# Description : This file contains a wrapper for the weighted least squares
#               prediction model implemented in statsmodels.
#
###############################################################################
# Import future
from __future__ import division

# Libraries
import sys
import copy
import warnings
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Import specific
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Libraries wrapper.
from pyamr.core.regression.wreg import RegressionWrapper
from pyamr.core.stats.wbase import fargs


# --------------------------------------------------------------------------
#                              helper methods
# -------------------------------------------------------------------------- 
def _cast_float(e):
    """Casts an element to float when feasible.
    """
    try:
        return float(e)
    except:
        return e


[docs]class PredictionResult: def __init__(self, mean, cilo, ciup, pstd, pilo, piup, time, nobs, endog): """The constructor Parameters ---------- Returns ------- """ # Set variables self.mean = mean self.cilo = cilo self.ciup = ciup self.pstd = pstd self.pilo = pilo self.piup = piup self.time = time self.nobs = nobs self.endog = endog # Set combined interval self.bilo, self.biup = np.copy(cilo), np.copy(ciup) self.bilo[self.nobs:] = self.pilo[self.nobs:] self.biup[self.nobs:] = self.piup[self.nobs:]
[docs]class WLSWrapper(RegressionWrapper): """The description... """ # Attributes. _name = 'WLS' # Label to add to the attributes when saving. # -------------------------------------------------------------------------- # helper methods # -------------------------------------------------------------------------- def _identifier(self, **kwargs): """This methods describes de model.""" # Returns description if self.W is not None: if hasattr(self.W, '_identifier'): return "%s(%s,%s)" % (self._name, self.trend, self.W._identifier(**kwargs)) else: return "%s(%s,%s)" % (self._name, self.trend, self.W.__class__.__name__) # Return basic return "%s(%s)" % (self._name, self.trend)
[docs] def pred_int(self, start=None, end=None, **kwargs): """This method computes the prediction intervals Parameters ---------- start : int (optional) The time t to start the prediction end : int (optional) The time t to end the prediction Returns ------- the standard prediction error and the prediction intervals """ # Create the exogenous variable for prediction exog = self._exog(start, end) # Compute pstd, pilo, piup = wls_prediction_std(self._raw, exog=exog, **kwargs) # Retrn return np.column_stack((pilo, piup))
# -------------------------------------------------------------------------- # set variables # -------------------------------------------------------------------------- def _params_from_summary(self): """Gets parameters from the summary result of the raw object. .. note: There must be a way to get all the elements extracted from the summary using the raw object and the methods implemented within. See: https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.html """ # Format summary summary = self._raw.summary().as_csv() summary = summary.split("\n", 1)[1] # Remove first line. summary = summary.replace("\n", ",") # Replace \n by comma. # Split in elements. elements = summary.split(",") elements = [_cast_float(e.strip()) for e in elements] #print(self._raw.summary()) #for tb in self._raw.summary().tables: # b = pd.read_html(tb.as_html(), header=0, index_col=0)[0] # print(b) # print("\n") #import sys #sys.exit() # Create series. d = {} # Add parameters. d['s_dw'] = elements[61] d['s_jb_value'] = elements[65] d['s_jb_prob'] = elements[69] d['s_skew'] = elements[67] d['s_kurtosis'] = elements[71] d['s_omnibus_value'] = elements[59] d['s_omnibus_prob'] = elements[63] # Return return d
[docs] def evaluate(self, alpha=0.05): """This method set all the variables into this class. Notes: - if instead of having the attribute series it is desired to set each element of the series as an attribute, just used the following statement: setattr(self, name, value). - Note the difference between the resid and wresid since they will also have different statistical properties. Furthermore, there is a vector with normalized residuals resid_pearson. @see: statsmodels.WLS @see: pyAMR.core.regression.RegressionResultsWrapper Parameters ---------- alpha : the confidence interval Returns ------- dictionary : map with all the parameters. """ # Libraries. from statsmodels.iolib.summary import _getnames # Create series. d = {} # Add generic metrics. d['rsquared'] = self._raw.rsquared d['rsquared_adj'] = self._raw.rsquared_adj d['fvalue'] = self._raw.fvalue d['fprob'] = self._raw.f_pvalue d['aic'] = self._raw.aic d['bic'] = self._raw.bic d['llf'] = self._raw.llf d['mse_model'] = self._raw.mse_model d['mse_resid'] = self._raw.mse_resid d['mse_total'] = self._raw.mse_total # Create params information. params_data = zip(_getnames(self._raw)[1], self._raw.params, self._raw.bse, self._raw.tvalues, self._raw.pvalues, self._raw.conf_int(alpha)) # Add coefficients statistics. for name, coef, std, tvalue, pvalue, (cil, ciu) in params_data: d['%s_%s' % (name, 'coef')] = coef d['%s_%s' % (name, 'std')] = std d['%s_%s' % (name, 'tvalue')] = tvalue d['%s_%s' % (name, 'tprob')] = pvalue d['%s_%s' % (name, 'cil')] = cil d['%s_%s' % (name, 'ciu')] = ciu # Further statistics. d.update(self._params_from_summary()) d.update(self._resid_stats()) # Return return d
# -------------------------------------------------------------------------- # display methods # --------------------------------------------------------------------------
[docs] def as_summary(self, **kwargs): """This method creates a summary string. """ # Elements to split by. find = "=" * 78 # Split and fill. smry = find.join(self._raw.summary(**kwargs).as_text().split(find)[:-1]) # Add in new lines. smry += "Normal (N): %#25.3f Prob(N): %#29.3f\n" % (self.m_nm_value, self.m_nm_prob) smry += find # Return return smry
# -------------------------------------------------------------------------- # fit and predict methods # --------------------------------------------------------------------------
[docs] def fit(self, endog, exog=None, trend='n', weights=None, W=None, **kwargs): """This method computes the WLS. @see statsmodels.regression.linear_model.WLS Parameters ---------- endog: array-like The endogenous variable (aka. time series data) exog: array-like The exogenous variable (by default is the time t starting in 0) trend: str-like, options = {c, n} Wether to add a constant or not. weights: array-like (optional) The weights for the weighted least square regression. If weights and W are both not None, the W instance will be used to transform the weights variables. W: object-like (optional) The instance to transform the weights. It must implement the function 'weights'. kwargs: dict-like The rest of the arguments to pass to the __init__ and fit methods of the class statsmodels.WLS (see xxx) Returns ------- object : OLSWrapper object. """ # Create a time-series exogenous variable if exog is None: exog = np.arange(endog.size) # Format the exogenous variable to add constant. if trend == 'c': exog = sm.add_constant(copy.deepcopy(exog)) # Compute weights from frequency if weights is not None and W is not None: weights = W.weights(weights) if np.isnan(weights).any(): warnings.warn("""\n There was an error computing the weights. In order to avoid a fatal error, uniform weights will be set. The weight transformer used was: {}""".format(W)) weights = None # Set uniform weights if weights is None : weights = np.ones(endog.size) # Save all newly created parameters kwargs['exog'] = exog kwargs['endog'] = endog kwargs['trend'] = trend kwargs['weights'] = weights kwargs['W'] = W # Call parents fit. super(WLSWrapper, self).fit(**kwargs) # Save results. return self
[docs] def get_prediction(self, start=None, end=None, alpha=0.05, **kwargs): """This method predicts using the model. .. todo:: Note that wls_predict_std has weights (those used in WLS) as an input parameter. However, these are not passed to the function. Those weights are available for insample predictions but not for forecasting. Parameters ---------- start : int (optional) The time t to start the prediction end : int (optional) The time t to end the prediction kwargs: The arguments to pass to the method get_prediction of the class statsmodels.WLS (see xxx) Returns ------- time, prediction mean, prediction confidence interval """ # Create exogenous variable. if start is not None or end is not None: kwargs['exog'] = self._exog(start, end) # Compute prediction. prediction = self._raw.get_prediction(**kwargs) # Add time. time = self._time(start=start, end=end) # Get plotting values mean = prediction.predicted_mean cito = prediction.conf_int() pito = self.pred_int(start=start, end=end, alpha=alpha) # Number of insample prediction ninsample = np.sum(time < self.endog.size) # Fill insamples pito[:ninsample, :] = cito[:ninsample, :] # Create result return np.column_stack((time, mean, pito)).T
[docs] def line(self, x): """This method returns arrays to plot line and confidence intervals. """ if 'const_coef' in self._result: return self.x1_coef * x + self.const_coef else: return self.x1_coef * x
if __name__ == '__main__': # pragma: no cover # Import import sys import matplotlib as mpl import matplotlib.pyplot as plt import statsmodels.robust.norms as norms # import weights. from pyamr.datasets.load import make_timeseries from pyamr.metrics.weights import SigmoidA # ---------------------------- # set basic configuration # ---------------------------- # Matplotlib options mpl.rc('legend', fontsize=6) mpl.rc('xtick', labelsize=6) mpl.rc('ytick', labelsize=6) # Set pandas configuration. pd.set_option('display.max_colwidth', 14) pd.set_option('display.width', 150) pd.set_option('display.precision', 4) # ---------------------------- # create data # ---------------------------- # Create timeseries data x, y, f = make_timeseries() # Create method to compute weights from frequencies W = SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0) # ---------------------------- # fit the model # ---------------------------- # Note that the function fit will call M.weights(weights) inside and will # store the M converter in the instance. Therefore, the code execute is # equivalent to <weights=M.weights(f)> with the only difference being that # the weight converter is not saved. wls = WLSWrapper(estimator=sm.WLS).fit(exog=x[:80], endog=y[:80], trend='c', weights=f[:80], W=W, missing='raise') # Print series. print("\nSeries:") print(wls.as_series()) # Print regression line. print("\nRegression line:") print(wls.line(np.arange(10))) # Print summary. print("\nSummary:") print(wls.as_summary()) # ----------------- # Save & Load # ----------------- # File location # fname = '../../examples/saved/wls-sample.pickle' # Save # wls.save(fname=fname) # Load # wls = WLSWrapper().load(fname=fname) # ------------- # Example I # ------------- # This example shows how to make predictions using the wrapper and how # to plot the resulting data. In addition, it compares the intervals # provided by get_prediction (confidence intervals) and the intervals # provided by wls_prediction_std (prediction intervals). # # To Do: Implement methods to compute CI and PI (see regression). # Variables. start, end = None, 180 # Compute predictions (exogenous?). preds = wls.get_prediction(start=start, end=end) # Create figure fig, ax = plt.subplots(1, 1, figsize=(11, 5)) # Plotting confidence intervals # ----------------------------- # Plot truth values. ax.plot(x, y, color='#A6CEE3', alpha=0.5, marker='o', markeredgecolor='k', markeredgewidth=0.5, markersize=5, linewidth=0.75, label='Observed') # Plot forecasted values. ax.plot(preds[0, :], preds[1, :], color='#FF0000', alpha=1.00, linewidth=2.0, label=wls._identifier(short=True)) print(preds.shape) # Plot the confidence intervals. ax.fill_between(x=preds[0, :], y1=preds[2, :], y2=preds[3, :], color='r', alpha=0.1) # Legend plt.legend() # ----------------------------- # Example II # ----------------------------- # This example performs grid search on a number of possible configurations # of the WLSWrapper. In particular, it tests the effect of different # objects to compute the weights from the frequencies. It presents both # the resulting pandas dataframe and also a figure. # Configuration # ------------- # This variable contains the weight functions to test. Note that in # the norms module there are other options such as [norms.HuberT(), # norms.Hampel(), norms.TrimmedMean(), norms.TukeyBiweight(), # norms.AndreWave(), norms.RamsayE()] w_func = [ norms.LeastSquares(), SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0), SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0, percentiles=[10, 90]), SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0, percentiles=[25, 75]), SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0, percentiles=[25, 90]), SigmoidA(r=200, g=0.5, offset=0.0, scale=1.0, percentiles=[40, 50])] # The grid search parameters. grid_params = [ # {'exog': [x], 'endog': [y], 'trend': ['c']}, {'exog': [x], 'endog': [y], 'trend': ['c'], 'weights': [f], 'W': w_func} ] # Grid search # ------------ # Perform grid search. summary = WLSWrapper(estimator=sm.WLS).grid_search(grid_params=grid_params) # Show grid results print("\nGrid search:") print(WLSWrapper().from_list_dataframe(summary).T) # Prediction # ---------- # Variables. start, end = 10, 150 # Create figure fig, axes = plt.subplots(1, 3, figsize=(10, 5)) # Plot truth values. axes[0].plot(x, y, color='#A6CEE3', alpha=0.5, marker='o', markeredgecolor='k', markeredgewidth=0.5, markersize=5, linewidth=0.75, label='Observed') # Plot frequencies axes[0].bar(x, f, color='gray', alpha=0.7, label='Frequency') # For each of the models in summary for i, model in enumerate(summary): # Compute predictions. preds = model.get_prediction(start=start, end=end) # Plot forecasted values. axes[0].plot(preds[0, :], preds[1, :], linewidth=1.0, label=model._identifier(short=True)) # Plot the confidence intervals. axes[0].fill_between(preds[0, :], preds[2, :], preds[3, :], alpha=0.1) # Plot weights assigned to each observation axes[1].plot(model.weights, marker='o', alpha=0.5, markeredgecolor='k', markeredgewidth=0.5, markersize=4, linewidth=0.00, label=model._identifier(short=True)) # Plot weights converter (W) functions. if model.W is not None: axes[2].plot(np.linspace(0, 1, 100), model.W.weights(np.linspace(0, 1, 100)), label=model._identifier(short=True)) # Grid. axes[0].grid(linestyle='--', linewidth=0.35, alpha=0.5) axes[1].grid(linestyle='--', linewidth=0.35, alpha=0.5) axes[2].grid(linestyle='--', linewidth=0.35, alpha=0.5) # Legend. axes[0].legend(loc=0) axes[1].legend(loc=0) axes[2].legend(loc=0) # Tight layout plt.tight_layout() # Show. # plt.show()