#' Baseball data from 1986-1987. The Hitters dataset contains #' information about 322 baseball players and 20 attributes as follows: #' # AtBat: Number of times at bat in 1986 # Hits: Number of hits in 1986 # HmRun: Number of home runs in 1986 # Runs: Number of runs in 1986 # RBI: Number of runs batted in in 1986 # Walks: Number of walks in 1986 # Years: Number of years in the major leagues # CAtBat: Number of times at bat during his career # CHits: Number of hits during his career # CHmRun: Number of home runs during his career # CRuns: Number of runs during his career # CRBI: Number of runs batted in during his career # CWalks: Number of walks during his career # League: A factor with levels A and N indicating player's league at the end of 1986 # Division: A factor with levels E and W indicating player's division at the end of 1986 # PutOuts: Number of put outs in 1986 # Assists: Number of assists in 1986 # Errors: Number of errors in 1986 # Salary: 1987 annual salary on opening day in thousands of dollars # NewLeague: A factor with levels A and N indicating player's league at the beginning of 1987 #' This dataset is available in the ISLR library. The target variable of interest here #' is the players' salaries. The target variable is missing for 59 of the players, thus #'we must omit those players from our analysis. setwd('/Users/shaina/Library/Mobile Documents/com~apple~CloudDocs/Data Mining 2020/Code') load("Hitters.Rdata") Hitters = na.omit(Hitters) #' Create training and validation index vectors. Do forward and backward selection on the training #' data. The regsubsets function will find "best" model for each number of variables. You can #' then retrieve the coefficients, however you have to multiply them into the data yourself (using #' the design matrix), you can't use the ever-fabulous predict() function. install.packages("leaps") library(leaps) # Create logical vectors that indicate whether an observation goes in training set or test set: set.seed(7515) train=sample(c(T,F), nrow(Hitters), rep=TRUE, p=c(0.6,0.4)) test=!train ############ # Run forward and Backward variable selection regfit.fwd = regsubsets(Salary~., data=Hitters[train, ], method="forward", nvmax=19) regfit.bwd = regsubsets(Salary~., data=Hitters[train, ], method="backward",nvmax=19) # Check out the order of the variables entering the models from each selection technique: regfit.fwd$vorder regfit.bwd$vorder # Create the design matrix for the test data so we can score it with the coefficients # from each of the candidate models. X = model.matrix(Salary~., data=Hitters[test, ]) #' Loop through and compute the validation MSE for each proposed model. There are 19 models #' from forward selection and 19 from backward. For each selection technique, we want to find #' the best number of parameters on out-of-sample data. fwd.val.MSE=vector() bwd.val.MSE=vector() for (i in 1:19) { beta.fwd = coef(regfit.fwd, i) beta.bwd = coef(regfit.bwd, i) pred.fwd = X[,names(beta.fwd)] %*% beta.fwd pred.bwd = X[,names(beta.bwd)] %*% beta.bwd fwd.val.MSE[i] = mean((Hitters$Salary[test] - pred.fwd)^2) bwd.val.MSE[i] = mean((Hitters$Salary[test] - pred.bwd)^2) } min(fwd.val.MSE) min(bwd.val.MSE) plot(1:19,fwd.val.MSE, xlab="Number of Inputs", ylab = "Validation MSE") plot(1:19,bwd.val.MSE, xlab="Number of Inputs", ylab = "Validation MSE") # Examine the resulting models (trained on training data) coef(regfit.fwd,10) coef(regfit.bwd,7) #' Finalize the resulting models on ALL of the data. Note that we run the variable selection method #' again! This is the classic CV idea, but you may choose not to. You could always train that specific #' model (with given variables) on all the data. You're likely to get a better model by #' re-running the variable selection technique. regfit.fwd.best = regsubsets(Salary~., data=Hitters, method="forward", nvmax=10) regfit.bwd.best = regsubsets(Salary~., data=Hitters, method="backward", nvmax=7) coef(regfit.fwd.best,10) coef(regfit.bwd.best,7) #' Indeed the best model for forward selection changed slightly when we re-ran the selection #' procedure. Runs leaves the model and Assists enters the model. These types of changes are #' not bad, but show evidence of multicollinearity in our variables. #' #' Let's check that out. #' By default the design matrix contains a column of all 1s for the intercept, so we remove #' that to compute the correlation matrix. X=model.matrix(Salary~. ,data=Hitters)[,-1] y = Hitters$Salary cor(X) #' Yikes! There is some pretty severe multicollinearity at play here. This is where #' Ridge regression really shines. ###################################################################### ###################################################################### ###################################################################### # Let's try Ridge Regression then... #' #' The glmnet package does not use a model formula but instead requires a design matrix #' with no intercept column, and a vector containing the target variable. #' #' This package kindly does all the work of cross-validation for us. We can choose the ridge #' parameter lambda by trying many different values and seeing which one works best on #' cross-validation. install.packages("glmnet") library(glmnet) set.seed(1) cv.out = cv.glmnet(X[train,], y[train], alpha=0) #' Only using training data #' for CV! plot(cv.out) # Find the value of lambda from cross-validation and see how the resulting model works on the # test data. bestlambda=cv.out$lambda.min bestlambda (ridge.mod.betas = coef(cv.out, s=bestlambda)) pred.ridge = predict(cv.out, s=bestlambda, newx=X[test,]) val.MSE.ridge = mean((pred.ridge - y[test])^2) val.MSE.ridge # Final step would be to re-calculate the model on ALL of the data. out=glmnet(X,y,alpha=0, lambda=bestlambda) ridge.coef = predict(out, type="coefficients") ridge.coef ###################################################################### ###################################################################### ###################################################################### #' Next, let's try LASSO. Again, just optimize the parameter lambda through #' Cross-Validation, seeing what provides the lowest MSE. #' To Run Lasso instead of ridge, simply change alpha to 1. set.seed(1) cv.out=cv.glmnet(X[train,],y[train],alpha=1) plot(cv.out) #' From here, we can grab the best lambda and check the model on our test data. bestlambda=cv.out$lambda.min (lasso.mod.betas = coef(cv.out, s=bestlambda)) pred.lasso = predict(cv.out, s=bestlambda, newx=X[test,]) (val.MSE.lasso = mean((pred.lasso-y[test])^2)) #' Results comparable to the other methods. If we want to go with this method, #' we'd just update the coefficients (by running the entire optimization again) #' on the entire dataset. out=glmnet(X,y,alpha=1,lambda=bestlambda) lasso.coef=predict(out, type="coefficients") lasso.coef #' Notice the sparsity in the parameter estimates - this is the benefit of LASSO over #' Ridge Regression. # Last thing to try if time permits is the ELASTIC NET set.seed(1) cv.out=cv.glmnet(X[train,],y[train],alpha=0.5) plot(cv.out) bestlambda=cv.out$lambda.min (elastic.mod.betas = coef(cv.out, s=bestlambda)) pred.elastic = predict(cv.out, s=bestlambda, newx=X[test,]) (val.MSE.elastic = mean((pred.elastic-y[test])^2)) ################################################################################ ################################################################################ ################################################################################ # A WORD ABOUT P-VALUES AND ATTEMPTING TO ESTIMATE STATISTICAL SIGNIFICANCE ################################################################################ ################################################################################ ################################################################################ #' "It is a very natural question to ask for standard errors of regression coefficients or other estimated quantities. #' In principle such standard errors can easily be calculated, e.g. using the bootstrap. Still, this package deliberately #' does not provide them. The reason for this is that standard errors are not very meaningful for strongly biased #' estimates such as arise from penalized estimation methods. Penalized estimation is a procedure that reduces the #' variance of estimators by introducing substantial bias. The bias of each estimator is therefore a major component #' of its mean squared error, whereas its variance may contribute only a small part. Unfortunately, in most applications #' of penalized regression it is impossible to obtain a sufficiently precise estimate of the bias. Any bootstrap-based #' calculations can only give an assessment of the variance of the estimates. Reliable estimates of the bias are only #' available if reliable unbiased estimates are available, which is typically not the case in situations in which #' penalized estimates are used. Reporting a standard error of a penalized estimate therefore tells only part of the story. #' It can give a mistaken impression of great precision, completely ignoring the inaccuracy caused by the bias. It is #' certainly a mistake to make confidence statements that are only based on an assessment of the variance of the estimates, #' such as bootstrap-based confidence intervals do." #' # ----Jelle Goeman, Ph.D. Leiden University, Author of the Penalized package in R.