Question: ### Based off Chapter 3 of LMWR2 #options(width = 50) #nice Word formatting at 20pts options(digits = 5, scipen = 2, show.signif.stars = FALSE) #
### Based off Chapter 3 of LMWR2 #options(width = 50) #nice Word formatting at 20pts options(digits = 5, scipen = 2, show.signif.stars = FALSE) # read Galapagos data data(gala, package = "faraway") ### 3.2 ### Test whether any of the (non-constant) predictors should be in the model # Fit full model lmod = lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, data = gala) # Fit null model nullmod = lm(Species ~ 1, data = gala) # F test for all (non-constant) predicts anova(nullmod, lmod) # manually verify results (rss0 = deviance(nullmod)) (rss = deviance(lmod)) (df0 = df.residual(nullmod)) (df = df.residual(lmod)) (f = ((rss0 - rss)/(df0-df))/(rss/df)) 1 - pf(f, df1 = df0-df, df2 = df) # Test statistic and p-value available from summary function summary(lmod) ### Test whether regression coefficient for Area is significant # fit reduced model (full without Area) lmods <- lm(Species ~ Elevation + Nearest + Scruz + Adjacent, data = gala) anova(lmods, lmod) # notice p-values are the same and t1^2 = F library(faraway) sumary(lmod) # compare t1^2 and F summary(lmod)$coeff[2,3]^2 anova(lmods, lmod)$F[2] # compare results to test of beta_Area = 0 when no other predictors in model sumary(lm(Species ~ Area, gala)) # Test whether regression coefficients Area and Adjacent should simultaneously # be included, assuming Elevation, Nearest and Scruz are in our model # Determine full and reduced models lmods <- lm(Species ~ Elevation + Nearest + Scruz, data = gala) # fit reduced model anova(lmods, lmod) # compare models using general f-test ### Test whether regression coefficients for Area and Adjacent are equal, ### assuming other predictors in model # Fit linear subspace model lmods <- lm(Species ~ I(Adjacent + Area) + Elevation + Nearest + Scruz, gala) # The I() means evaulate this function before creating the model. # This allows us to create transformed predictors in the model statement. anova(lmods, lmod) # compare models using general f-test ### Test whether beta_Area = 0.5 assuming other predictors are in the model # Fit reduced mdoel lmods <- lm(Species ~ Area + offset(0.5*Elevation) + Nearest + Scruz + Adjacent, gala) # the offset term indicates that this term is a constant an not to be estimated. anova(lmods, lmod) # compare models using general f-test # Same test using t test (tstat <- (coef(lmod)[3] - 0.5)/sqrt(vcov(lmod)[3,3])) # test statistic 2 * (1 - pt(abs(tstat), df = df.residual(lmod))) # p-value tstat^2 ### 3.3 Permutation tests ### Test whether Nearest and Scruz predictors needed lmod <- lm(Species ~ Nearest + Scruz, data = gala) lms <- summary(lmod) # f statistic and p-value # Test statistic available from summary function lms$fstatistic 1 - pf(lms$fstatistic[1], lms$fstatistic[2], lms$fstatistic[3]) # Randomly sample responses (4000 times), recompute model and fstatistic nreps <- 4000 set.seed(123) # reproducible results fstats <- numeric(nreps) for(i in 1:nreps) { fstats[i] <- summary(lm(sample(Species) ~ Nearest + Scruz, gala)) $fstatistic[1] } # compute p-value (the proportion of simulated test statistics at least as extreme # as our observed test statistics). mean(fstats >= lms$fstat[1]) ### Test test whether the Scruz predictor should be in the model when Nearest is. # Test statistic available from summary function tstat <- lms$coef[3,3] # Randomly sample Scruz (4000 times), recompute model and tstatistic nreps <- 4000 tstats <- numeric(nreps) set.seed(123) # reproducability for(i in 1:nreps) { tstats[i] <- summary(lm(Species ~ Nearest + sample(Scruz), gala))$coef[3,3] } # compute p-value (the proportion of simulated test statistics at least as extreme # as our observed test statistics). mean(abs(tstats) >= abs(tstat)) #### 3.5 Confidence Intervals for Beta # Fit model lmod = lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, data = gala) sumary(lmod) qt(.975, df = df.residual(lmod)) # manually construct interval -.02394 + c(-1, 1) * 2.0639 * .02242 # construct 95% confidence intervals of all parameters confint(lmod) # construct 95% joint confidence intervals for beta_dpi and beta_pop75. library(ellipse) # construct plot of joint confidence region for Area, Adjacent coefficients plot(ellipse(lmod, c(2, 6)), type = "l", ylim = c(-0.13, 0)) # add origin and point estimates to plot points(0, 0) points(coef(lmod)[2], coef(lmod)[6]) # add vertical and horizontal lines for individual confidence intervals abline(v = confint(lmod)[2,], lty = 2) # plots vertical lines abline(h = confint(lmod)[6,], lty = 2) # plots horizontal lines #### 3.6 Bootstrap confidence intervals nreps = 4000 # simulation example n = 10 # number of observations betahats = matrix(0, nrow = nreps, ncol = 2) X = cbind(1, runif(n)) # fixed x beta = matrix(c(1, 2)) # fixed beta for(i in 1:nreps) { y = X %*% beta + rnorm(n) betahats[i,] = coef(lm(y ~ X - 1)) } # empirical density of betahat0 plot(density(betahats[,1]), main = "empirical density of betahat0") # quantiles form empirical distribution abline(v = quantile(betahats[,1], prob = c(.025, .975))) ### bootstrap example set.seed(123) nb = 4000 # number of bootstrap samples coefmat = matrix(0, nb, 6) resids = residuals(lmod) #extract residuals preds = fitted(lmod) # fitted values for(i in 1:nb) { booty <- preds + sample(resids, replace = TRUE) bmod <- update(lmod, booty ~ .) coefmat[i,] = coef(bmod) } # construct intervals colnames(coefmat) = c("Intercept", colnames(gala[,3:7])) coefmat <- data.frame(coefmat) apply(coefmat, 2, function(x) quantile(x, c(.025, .975))) # plot of densities and intervals coefdf = data.frame(value = c(coefmat), parameter = rep(names(coefmat), each = nb)) library(ggplot2) # x = Area means that we are only doing univariate plot # geom_density maps the variables values to the density geometry # geom_vline adds vertical lines at the specified values # theme_classic makes the plot look more like base R graphics ggplot(coefmat, aes(x = Area)) + geom_density() + geom_vline(xintercept = c(-.0628, .0185), lty = 2) + theme_classic() ggplot(coefmat, aes(x = Adjacent)) + geom_density() + geom_vline(xintercept = c(-.104, -.0409), lty = 2) + theme_classic() # just messing around # install.packages("ggthemes", # repos = "https://cran.rstudio.org") library(ggthemes) library(ggtech) p = ggplot(coefmat, aes(x = Adjacent)) + geom_density() + geom_vline(xintercept = c(-.104, -.0409), lty = 2) # base R p + theme_base() # wall street journal p + theme_wsj() # edward tufte style p + theme_tufte() # edward tufte style p + theme_tufte() # stata style p + theme_stata() # excel style p + theme_excel() # excel style p + theme_excel() # google docs style p + theme_gdocs() teengamb9121484.R juro Wed Mar 9 17:27:46 2016 #install.packages("faraway") library(faraway) packageVersion("faraway") ## [1] '1.0.7' data(teengamb,package="faraway") fullmod<-lm(gamble~sex+status+verbal+income,data=teengamb) summary(fullmod) ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## Call: lm(formula = gamble ~ sex + status + verbal + income, data = teengamb) Residuals: Min 1Q -51.082 -11.320 Median -1.451 3Q 9.452 Max 94.252 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 22.55565 17.19680 1.312 0.1968 sex -22.11833 8.21111 -2.694 0.0101 * status 0.05223 0.28111 0.186 0.8535 verbal -2.95949 2.17215 -1.362 0.1803 income 4.96198 1.02539 4.839 1.79e-05 *** --Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 22.69 on 42 degrees of freedom Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816 F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06 redmodInc<-lm(gamble~income,data=teengamb) summary(redmodInc) ## ## ## ## ## ## ## ## ## ## Call: lm(formula = gamble ~ income, data = teengamb) Residuals: Min 1Q -46.020 -11.874 Median -3.757 3Q Max 11.934 107.120 Coefficients: Estimate Std. Error t value Pr(>|t|) 1 ## ## ## ## ## ## ## ## (Intercept) -6.325 6.030 -1.049 0.3 income 5.520 1.036 5.330 3.05e-06 *** --Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 24.95 on 45 degrees of freedom Multiple R-squared: 0.387, Adjusted R-squared: 0.3734 F-statistic: 28.41 on 1 and 45 DF, p-value: 3.045e-06 anova(fullmod,redmodInc) ## ## ## ## ## ## ## ## ## Analysis of Variance Table Model 1: gamble ~ Model 2: gamble ~ Res.Df RSS Df 1 42 21624 2 45 28009 -3 --Signif. codes: 0 sex + status + verbal + income income Sum of Sq F Pr(>F) -6384.8 4.1338 0.01177 * '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 2 9121484woshom.R juro Wed Mar 9 21:33:43 2016 ############################################################################### ### Preliminaries: installing faraway package (if needed); ### load faraway pacakge; check package version; ### load the teengamb dataset; get info on dataset teengamb #install.packages("faraway") library(faraway) packageVersion("faraway") ## [1] '1.0.7' data(teengamb,package="faraway") ??teengamb ############################################################################### ### For A): Fit of the full model and summary of results for examining p-values. ### Last column gives p-values for the test of significance for including the ### variable in the model after all the other variables have been included: ### variables income and sex are each significant at the alpha=5%; p-value of ### income = 0.00001709 < 0.001 (marked by ***); p-value of sex just slightly ### bigger than 0.01 but smaller than 0.05 (marked by *). ### First column lists coefficient names, second column are parameter estimates ### from the fit of the full model, third column are the standard error for the ### estimates, fourth column is the statistic for the test of significance for ### including the variable after all the other variables have been included fullmod<-lm(gamble ~ sex +status +verbal +income, data=teengamb) summary(fullmod) ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## Call: lm(formula = gamble ~ sex + status + verbal + income, data = teengamb) Residuals: Min 1Q -51.082 -11.320 Median -1.451 3Q 9.452 Max 94.252 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 22.55565 17.19680 1.312 0.1968 sex -22.11833 8.21111 -2.694 0.0101 * status 0.05223 0.28111 0.186 0.8535 verbal -2.95949 2.17215 -1.362 0.1803 income 4.96198 1.02539 4.839 1.79e-05 *** --Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 1 ## ## Residual standard error: 22.69 on 42 degrees of freedom ## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816 ## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06 ############################################################################### ### The 5th column can also be verified by, for example: ### redmod_income<-lm(gamble ~ sex +status + verbal, data=teengamb) ### anova(redmod_income,fullmod) ### The p-value appearing in the sixth column of the anova() output is ### the same as that in the 5th column in the summary() output of the ### full model for the varaible income; note also that the square root ### of the F-stat here (23.417) is the t-stat (4.389) in the summary() output redmod_income<-lm(gamble ~ sex +status + verbal, data=teengamb) anova(redmod_income,fullmod) ## ## ## ## ## ## ## ## ## Analysis of Variance Table Model 1: gamble ~ Model 2: gamble ~ Res.Df RSS Df 1 43 33680 2 42 21624 1 --Signif. codes: 0 sex + status + verbal sex + status + verbal + income Sum of Sq F Pr(>F) 12056 23.417 1.792e-05 *** '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ############################################################################### ### For B): The variable sex is an indicator variable (see output of ### help(faraway::teengamb)). It has value of 1 for females and ### 0 for males. Hence, all else being equal, the value of the ### coefficient for sex gives the difference in gamble between females ### and males; the estimated value from the model is -22.11833 pounds; ### i.e., all else being equal (status, verbal and income), females ### spend 22.11833 pounds less in gambling than their male counterparts ############################################################################### ### For C): Fit the reduced model gamble~income; get summary; ### The summary shows that income by itself explain about 38.9% of the variation ### in gamble as can be seen from the Multiple R-squared; ### the anova() output gives the F-test for comparing reduced model with the ### full model; ### This tests whether the additional variables jointly significantly explain ### the left-over variation in gamble after income has already been included ### in the model, and hence, if these additional variable should be ### simultaneouly included in the model after income has already been included ### the answer is yes at alpha=0.05; p-value of 0.01177 < 0.05 = 5% redmodInc<-lm(gamble~income,data=teengamb) summary(redmodInc) ## 2 ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## Call: lm(formula = gamble ~ income, data = teengamb) Residuals: Min 1Q -46.020 -11.874 Median -3.757 3Q Max 11.934 107.120 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -6.325 6.030 -1.049 0.3 income 5.520 1.036 5.330 3.05e-06 *** --Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 24.95 on 45 degrees of freedom Multiple R-squared: 0.387, Adjusted R-squared: 0.3734 F-statistic: 28.41 on 1 and 45 DF, p-value: 3.045e-06 anova(redmodInc,fullmod) ## ## ## ## ## ## ## ## ## Analysis of Variance Table Model 1: gamble ~ Model 2: gamble ~ Res.Df RSS Df 1 45 28009 2 42 21624 3 --Signif. codes: 0 income sex + status + verbal + income Sum of Sq F Pr(>F) 6384.8 4.1338 0.01177 * '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 3 ############################################################################### ### Preliminaries: installing faraway package (if needed); ### load faraway pacakge; check package version; ### load the teengamb dataset; get info on dataset teengamb #install.packages("faraway") library(faraway) packageVersion("faraway") data(teengamb,package="faraway") ??teengamb ############################################################################### ### For A): Fit of the full model and summary of results for examining p-values. ### Last column gives p-values for the test of significance for including the ### variable in the model after all the other variables have been included: ### variables income and sex are each significant at the alpha=5%; p-value of ### income = 0.00001709 < 0.001 (marked by ***); p-value of sex just slightly ### bigger than 0.01 but smaller than 0.05 (marked by *). ### First column lists coefficient names, second column are parameter estimates ### from the fit of the full model, third column are the standard error for the ### estimates, fourth column is the statistic for the test of significance for ### including the variable after all the other variables have been included fullmod<-lm(gamble ~ sex +status +verbal +income, data=teengamb) summary(fullmod) ############################################################################### ### The 5th column can also be verified by, for example: ### redmod_income<-lm(gamble ~ sex +status + verbal, data=teengamb) ### anova(redmod_income,fullmod) ### The p-value appearing in the sixth column of the anova() output is ### the same as that in the 5th column in the summary() output of the ### full model for the varaible income; note also that the square root ### of the F-stat here (23.417) is the t-stat (4.389) in the summary() output redmod_income<-lm(gamble ~ sex +status + verbal, data=teengamb) anova(redmod_income,fullmod) ############################################################################### ### For B): The variable sex is an indicator variable (see output of ### help(faraway::teengamb)). It has value of 1 for females and ### 0 for males. Hence, all else being equal, the value of the ### coefficient for sex gives the difference in gamble between females ### and males; the estimated value from the model is -22.11833 pounds; ### i.e., all else being equal (status, verbal and income), females ### spend 22.11833 pounds less in gambling than their male counterparts ############################################################################### ### For C): Fit the reduced model gamble~income; get summary; ### The summary shows that income by itself explain about 38.9% of the variation ### in gamble as can be seen from the Multiple R-squared; ### the anova() output gives the F-test for comparing reduced model with the ### full model; ### This tests whether the additional variables jointly significantly explain ### the left-over variation in gamble after income has already been included ### in the model, and hence, if these additional variable should be ### simultaneouly included in the model after income has already been included ### the answer is yes at alpha=0.05; p-value of 0.01177 < 0.05 = 5% redmodInc<-lm(gamble~income,data=teengamb) summary(redmodInc) anova(redmodInc,fullmod)