# NOTE(review): setwd() with an absolute path makes the script non-portable --
# prefer an RStudio project or relative paths; kept so the handout still runs
# on the author's machine.
setwd("C:/Users/user/Desktop/政大/TA/迴歸分析TA/講義/111-2/0612")
# Patient satisfaction data (file CH06PR15.txt, no header row), so read with
# header = FALSE and name the columns ourselves.  Use FALSE, not F: F is an
# ordinary (reassignable) binding, FALSE is a reserved word.
PatientSatisfaction <- read.table("CH06PR15.txt", header = FALSE)
head(PatientSatisfaction)
## V1 V2 V3 V4
## 1 48 50 51 2.3
## 2 57 36 46 2.3
## 3 66 40 48 2.2
## 4 70 41 44 1.8
## 5 89 28 43 1.8
## 6 36 49 54 2.9
# Y = response; X1, X2, X3 = predictors.
colnames(PatientSatisfaction) <- c("Y", "X1", "X2", "X3")
head(PatientSatisfaction)
## Y X1 X2 X3
## 1 48 50 51 2.3
## 2 57 36 46 2.3
## 3 66 40 48 2.2
## 4 70 41 44 1.8
## 5 89 28 43 1.8
## 6 36 49 54 2.9
# NOTE(review): attach() is discouraged (search-path masking surprises), but
# every lm() call below refers to Y, X1, X2, X3 directly, so it is kept here.
attach(PatientSatisfaction)
# Simple linear regression of Y on X1 alone.
# NOTE(review): Y and X1 resolve via the attach(PatientSatisfaction) call above;
# lm(Y ~ X1, data = PatientSatisfaction) would be more robust.
m1 <- lm(Y ~ X1)
summary(m1)
##
## Call:
## lm(formula = Y ~ X1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9281 -9.6808 0.7573 10.8913 17.7986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.9432 7.0848 16.930 < 2e-16 ***
## X1 -1.5206 0.1799 -8.455 9.06e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 44 degrees of freedom
# Multiple R-squared below is R^2_{Y1} = 0.619, used in the discussion that follows.
## Multiple R-squared: 0.619, Adjusted R-squared: 0.6103
## F-statistic: 71.48 on 1 and 44 DF, p-value: 9.058e-11
From the R output (the Multiple R-squared of the simple regression of \(Y\) on \(X_1\)), we know that \(R^2_{Y1} = 0.619\).
# Simple linear regression of Y on X2 alone; the ANOVA table gives
# SSR(X2) = 4860.3 and SSE(X2) = 8509.0, the denominator used later for the
# coefficient of partial determination R^2_{Y1|2}.
m2 <- lm(Y ~ X2)
anova(m2)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X2 1 4860.3 4860.3 25.132 9.23e-06 ***
## Residuals 44 8509.0 193.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Enter X2 first, then X1.  anova() reports sequential (Type I) sums of
# squares, so the X1 row is the extra sum of squares SSR(X1 | X2) = 3896.0.
m21 <- lm(Y ~ X2 + X1)
anova(m21)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X2 1 4860.3 4860.3 45.305 3.161e-08 ***
## X1 1 3896.0 3896.0 36.317 3.348e-07 ***
## Residuals 43 4613.0 107.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From R output, we know that \(R^2_{Y1|2} = \frac{SSR(X_1|X_2)}{SSE(X_2)} = \frac{3896}{8509} \approx 0.4579\).
# Regression on X2 and X3 only.  The residual row gives SSE(X2, X3) = 7106.4,
# which is needed below for R^2_{Y1|23}.
m23 <- lm(Y ~ X2 + X3)
anova(m23)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X2 1 4860.3 4860.3 29.4089 2.507e-06 ***
## X3 1 1402.7 1402.7 8.4873 0.005653 **
## Residuals 43 7106.4 165.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full model with predictors entered in the order X1, X2, X3.  The residual
# row gives SSE(X1, X2, X3) = 4248.8, the other ingredient for R^2_{Y1|23}.
m123 <- lm(Y ~ X1 + X2 + X3)
anova(m123)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 8275.4 8275.4 81.8026 2.059e-11 ***
## X2 1 480.9 480.9 4.7539 0.03489 *
## X3 1 364.2 364.2 3.5997 0.06468 .
## Residuals 42 4248.8 101.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From R output, we know that \(R^2_{Y1|23} = \frac{SSR(X_1|X_2, X_3)}{SSE(X_2, X_3)} = \frac{SSE(X_2, X_3) - SSE(X_1, X_2, X_3)}{SSE(X_2, X_3)} = \frac{7106.4 - 4248.8}{7106.4} \approx 0.4021\).
Hence the degree of marginal linear association between \(Y\) and \(X_1\) decreases by roughly a quarter (from \(0.619\) to \(0.458\)) when adjusted for \(X_2\). It decreases only a little more (to \(0.402\)) when adjusted for both \(X_2\) and \(X_3\).
# Job proficiency data (file CH09PR10.txt, no header row): Y = response,
# x1..x4 = scores on four aptitude tests.
# NOTE(review): the name `data` masks the base function utils::data(); kept
# because the rest of the script refers to it, but a more specific name
# (e.g. job_prof) would be better.
data <- read.table("CH09PR10.txt")
names(data) <- c("Y","x1","x2","x3","x4")
head(data)
## Y x1 x2 x3 x4
## 1 88 86 110 100 87
## 2 80 62 97 99 100
## 3 96 110 107 103 103
## 4 76 101 117 93 95
## 5 80 100 101 95 88
## 6 73 78 85 95 84
# One boxplot per aptitude test on a 2 x 2 grid.  Save the old graphical
# parameters and restore them afterwards so later plots are unaffected.
# Fix: these panels show the test scores (x1-x4), not the response Y, so the
# y-axis label is "Test Score" rather than "Job Proficiency".
op <- par(mfrow = c(2, 2))
boxplot(data$x1, main = "BoxPlot of Test 1", ylab = "Test Score")
boxplot(data$x2, main = "BoxPlot of Test 2", ylab = "Test Score")
boxplot(data$x3, main = "BoxPlot of Test 3", ylab = "Test Score")
boxplot(data$x4, main = "BoxPlot of Test 4", ylab = "Test Score")
par(op)
There is an outlier in Test 1.
# Basic Scatterplot Matrix
# Pairwise scatterplots of the response and all four predictors.
pairs(~Y + x1 + x2 + x3 + x4, data, main = "Simple Scatterplot Matrix")
# Hmisc::rcorr() returns correlations together with sample sizes and p-values;
# $r extracts just the correlation matrix (same values as cor(data) here).
library("Hmisc")
# cor(data)
rcorr(as.matrix(data))$r
## Y x1 x2 x3 x4
## Y 1.0000000 0.5144107 0.4970057 0.8970645 0.8693865
## x1 0.5144107 1.0000000 0.1022689 0.1807692 0.3266632
## x2 0.4970057 0.1022689 1.0000000 0.5190448 0.3967101
## x3 0.8970645 0.1807692 0.5190448 1.0000000 0.7820385
## x4 0.8693865 0.3266632 0.3967101 0.7820385 1.0000000
# Full first-order model: Y ~ . regresses Y on every other column of `data`
# (x1, x2, x3, x4).
data.lm <- lm(Y ~., data)
summary(data.lm)
##
## Call:
## lm(formula = Y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9779 -3.4506 0.0941 2.4749 5.9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.38182 9.94106 -12.512 6.48e-11 ***
## x1 0.29573 0.04397 6.725 1.52e-06 ***
# x2 is the only predictor whose t-test is not significant (p = 0.404).
## x2 0.04829 0.05662 0.853 0.40383
## x3 1.30601 0.16409 7.959 1.26e-07 ***
## x4 0.51982 0.13194 3.940 0.00081 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.099 on 20 degrees of freedom
## Multiple R-squared: 0.9629, Adjusted R-squared: 0.9555
## F-statistic: 129.7 on 4 and 20 DF, p-value: 5.262e-14
\(\hat{Y} = -124.38 + 0.30 x_1 + 0.05 x_2 + 1.31 x_3 + 0.52 x_4\)
Since only \(\beta_2\) is not significant and the overall \(F\) statistic of the model is significant, I think all predictor variables should be retained in the model.
# Best-subsets regression via ALSM::BestSub() (which uses the 'leaps'
# package).  Arguments: data[, 2:5] = the four predictors, data[, 1] = the
# response Y; method = 'r2adj' ranks subsets by adjusted R^2; num = 4 keeps
# the top 4 subsets of each size.  The 1/0 columns indicate which of
# x1..x4 are in each subset; SSEp, r2, r2.adj, Cp, AICp, SBCp and PRESSp
# are reported for every subset so the criteria can be compared.
library("ALSM")
library("leaps")
BestSub(data[, 2:5], data[, 1], method = 'r2adj', num = 4)
## p 1 2 3 4 SSEp r2 r2.adj Cp AICp SBCp
## 1 2 0 0 1 0 1768.0228 0.8047247 0.7962344 84.246496 110.46853 112.90629
## 1 2 0 0 0 1 2210.6887 0.7558329 0.7452170 110.597414 116.05459 118.49234
## 1 2 1 0 0 0 6658.1453 0.2646184 0.2326452 375.344689 143.61801 146.05576
## 1 2 0 1 0 0 6817.5291 0.2470147 0.2142762 384.832454 144.20941 146.64717
## 2 3 1 0 1 0 606.6574 0.9329956 0.9269043 17.112978 85.72721 89.38384
## 2 3 0 0 1 1 1111.3126 0.8772573 0.8660988 47.153985 100.86053 104.51716
## 2 3 1 0 0 1 1672.5853 0.8152656 0.7984716 80.565307 111.08125 114.73788
## 2 3 0 1 1 0 1755.8127 0.8060733 0.7884436 85.519650 112.29528 115.95191
## 3 4 1 0 1 1 348.1970 0.9615422 0.9560482 3.727399 73.84732 78.72282
## 3 4 1 1 1 0 596.7207 0.9340931 0.9246779 18.521465 87.31433 92.18984
## 3 4 0 1 1 1 1095.8078 0.8789698 0.8616797 48.231020 102.50928 107.38479
## 3 4 1 1 0 1 1400.1275 0.8453581 0.8232664 66.346500 108.63607 113.51157
## 4 5 1 1 1 1 335.9775 0.9628918 0.9554702 5.000000 74.95421 81.04859
## PRESSp
## 1 2064.5976
## 1 2548.6349
## 1 7791.5994
## 1 7991.0964
## 2 760.9744
## 2 1449.6001
## 2 2109.8967
## 2 2206.6460
## 3 471.4520
## 3 831.1521
## 3 1570.5610
## 3 1885.8454
## 4 518.9885
The four best subset regression models are
subset | \(R^2_{a,p}\) |
---|---|
\(x_1, x_3, x_4\) | 0.9560482 |
\(x_1, x_2, x_3, x_4\) | 0.9554702 |
\(x_1, x_3\) | 0.9269043 |
\(x_1, x_2, x_3\) | 0.9246779 |
Besides \(R^2_{a,p}\), there are the \(C_p\) criterion, \(AIC_p\), and \(SBC_p\) that I can use to help select the best model. They all place penalties on adding predictors.