Minggu, 15 Agustus 2021

Regresi Linear dan Validasi

     

        Regresi linear merupakan metode yang telah dikenal sejak abad ke-19. Regresi linear berganda dapat dinyatakan sebagai $y = f(X) + \varepsilon$, dengan $\varepsilon$ galat acak dan

$$f(X) = \beta_0 + \sum_{j=1}^{p} \beta_j X_j$$

Dalam model tersebut, $X_j$ merupakan variabel independen yang dapat berupa:
- Variabel kuantitatif, hasil transformasinya, maupun ekspansinya
- Variabel boneka
- Interaksi antarvariabel

Contoh Regresi dengan R


- Data Auto pada pustaka (library) ISLR memuat sejumlah informasi terkait mobil, di antaranya jarak yang dapat ditempuh per galon bahan bakar (mpg), jumlah silinder (cylinders), volume silinder (displacement), daya (horsepower), berat (weight), akselerasi (acceleration), tahun model (year), asal (origin), dan nama/merek mobil (name).

- Ingin diketahui variabel apa saja yang memengaruhi jarak tempuh per galon bahan bakar (mpg).



> library(ISLR)

> head(Auto)

  mpg cylinders displacement horsepower weight acceleration year origin

1  18         8          307        130   3504         12.0   70      1

2  15         8          350        165   3693         11.5   70      1

3  18         8          318        150   3436         11.0   70      1

4  16         8          304        150   3433         12.0   70      1

5  17         8          302        140   3449         10.5   70      1

6  15         8          429        198   4341         10.0   70      1

                       name

1 chevrolet chevelle malibu

2         buick skylark 320

3        plymouth satellite

4             amc rebel sst

5               ford torino

6          ford galaxie 500

> attach(Auto)

> plot(cylinders, mpg)

 

 

> plot(displacement, mpg)

 

 

> plot(horsepower,mpg)

 

 

> plot(weight,mpg)

 

 

> plot(acceleration,mpg)

 

 

> plot(year,mpg)



 

> plot(origin,mpg)

 

 

> #regresi linear berganda

> m1 <- lm(mpg~cylinders+displacement+horsepower+weight+acceleration+year+origin, data = Auto)

> summary(m1)

 

Call:

lm(formula = mpg ~ cylinders + displacement + horsepower + weight +

    acceleration + year + origin, data = Auto)

 

Residuals:

    Min      1Q  Median      3Q     Max

-9.5903 -2.1565 -0.1169  1.8690 13.0604

 

Coefficients:

               Estimate Std. Error t value Pr(>|t|)   

(Intercept)  -17.218435   4.644294  -3.707  0.00024 ***

cylinders     -0.493376   0.323282  -1.526  0.12780   

displacement   0.019896   0.007515   2.647  0.00844 **

horsepower    -0.016951   0.013787  -1.230  0.21963   

weight        -0.006474   0.000652  -9.929  < 2e-16 ***

acceleration   0.080576   0.098845   0.815  0.41548   

year           0.750773   0.050973  14.729  < 2e-16 ***

origin         1.426141   0.278136   5.127 4.67e-07 ***

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 3.328 on 384 degrees of freedom

Multiple R-squared:  0.8215,  Adjusted R-squared:  0.8182

F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

 

> m2 <- lm(mpg~cylinders+displacement+horsepower+weight+year+origin, data = Auto)

> summary(m2)

 

Call:

lm(formula = mpg ~ cylinders + displacement + horsepower + weight +

    year + origin, data = Auto)

 

Residuals:

    Min      1Q  Median      3Q     Max

-9.7604 -2.1791 -0.1535  1.8524 13.1209

 

Coefficients:

               Estimate Std. Error t value Pr(>|t|)   

(Intercept)  -1.556e+01  4.175e+00  -3.728 0.000222 ***

cylinders    -5.067e-01  3.227e-01  -1.570 0.117236   

displacement  1.927e-02  7.472e-03   2.579 0.010287 * 

horsepower   -2.389e-02  1.084e-02  -2.205 0.028031 * 

weight       -6.218e-03  5.714e-04 -10.883  < 2e-16 ***

year          7.475e-01  5.079e-02  14.717  < 2e-16 ***

origin        1.428e+00  2.780e-01   5.138 4.43e-07 ***

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 3.326 on 385 degrees of freedom

Multiple R-squared:  0.8212,  Adjusted R-squared:  0.8184

F-statistic: 294.6 on 6 and 385 DF,  p-value: < 2.2e-16

 

> m3 <- lm(mpg~displacement+horsepower+weight+year+origin, data = Auto)

> summary(m3)

 

Call:

lm(formula = mpg ~ displacement + horsepower + weight + year +

    origin, data = Auto)

 

Residuals:

    Min      1Q  Median      3Q     Max

-9.4882 -2.1157 -0.1645  1.8650 13.0544

 

Coefficients:

               Estimate Std. Error t value Pr(>|t|)   

(Intercept)  -1.669e+01  4.120e+00  -4.051 6.16e-05 ***

displacement  1.137e-02  5.536e-03   2.054   0.0406 * 

horsepower   -2.192e-02  1.078e-02  -2.033   0.0428 * 

weight       -6.324e-03  5.685e-04 -11.124  < 2e-16 ***

year          7.484e-01  5.089e-02  14.707  < 2e-16 ***

origin        1.385e+00  2.772e-01   4.998 8.80e-07 ***

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 3.333 on 386 degrees of freedom

Multiple R-squared:   0.82,   Adjusted R-squared:  0.8177

F-statistic: 351.7 on 5 and 386 DF,  p-value: < 2.2e-16

 

> hors2 <- horsepower^2

> weig2 <- weight^2

> Auto2 <- cbind(Auto, hors2, weig2)

> m4 <- lm(mpg~horsepower+hors2+weight+weig2+year+origin, data = Auto2)

> summary(m4)

 

Call:

lm(formula = mpg ~ horsepower + hors2 + weight + weig2 + year +

    origin, data = Auto2)

 

Residuals:

    Min      1Q  Median      3Q     Max

-8.8841 -1.7292 -0.1211  1.5860 12.1360

 

Coefficients:

              Estimate Std. Error t value Pr(>|t|)   

(Intercept)  2.602e+00  4.082e+00   0.637  0.52426   

horsepower  -1.593e-01  2.960e-02  -5.383 1.27e-07 ***

hors2        5.088e-04  1.042e-04   4.882 1.55e-06 ***

weight      -1.481e-02  1.838e-03  -8.056 1.00e-14 ***

weig2        1.568e-06  2.699e-07   5.809 1.32e-08 ***

year         7.792e-01  4.471e-02  17.426  < 2e-16 ***

origin       6.685e-01  2.372e-01   2.818  0.00508 **

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 2.915 on 385 degrees of freedom

Multiple R-squared:  0.8626,  Adjusted R-squared:  0.8605

F-statistic:   403 on 6 and 385 DF,  p-value: < 2.2e-16

 

>

> #validasi model regresi

> nrow(Auto)

[1] 392

> set.seed(10)

> train1 <- sample(392,196)

> set.seed(20)

> train2 <- sample(392,196)

> head(train1)

[1] 137 330 368  72 211 344

> head(train2)

[1] 166 191 363 376 130 249

> m31 <- lm(mpg~displacement+horsepower+weight+year+origin, data = Auto, subset = train1)

> summary(m31)

 

Call:

lm(formula = mpg ~ displacement + horsepower + weight + year +

    origin, data = Auto, subset = train1)

 

Residuals:

    Min      1Q  Median      3Q     Max

-8.8324 -2.1154 -0.0379  1.8378 11.9800

 

Coefficients:

               Estimate Std. Error t value Pr(>|t|)   

(Intercept)  -1.454e+01  5.379e+00  -2.703  0.00749 **

displacement  8.726e-03  7.790e-03   1.120  0.26409   

horsepower   -3.019e-02  1.491e-02  -2.024  0.04432 * 

weight       -5.556e-03  7.915e-04  -7.020 3.82e-11 ***

year          7.103e-01  6.748e-02  10.526  < 2e-16 ***

origin        1.144e+00  3.908e-01   2.927  0.00384 **

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 3.278 on 190 degrees of freedom

Multiple R-squared:  0.8166,  Adjusted R-squared:  0.8118

F-statistic: 169.2 on 5 and 190 DF,  p-value: < 2.2e-16

 

> m32 <- lm(mpg~displacement+horsepower+weight+year+origin, data = Auto, subset = train2)

> summary(m32)

 

Call:

lm(formula = mpg ~ displacement + horsepower + weight + year +

    origin, data = Auto, subset = train2)

 

Residuals:

   Min     1Q Median     3Q    Max

-8.053 -2.345 -0.156  1.829 13.245

 

Coefficients:

               Estimate Std. Error t value Pr(>|t|)   

(Intercept)  -1.364e+01  6.288e+00  -2.170  0.03126 * 

displacement  7.400e-03  7.475e-03   0.990  0.32345   

horsepower   -1.319e-02  1.491e-02  -0.884  0.37758   

weight       -6.474e-03  7.569e-04  -8.554 3.92e-15 ***

year          7.115e-01  7.950e-02   8.949 3.24e-16 ***

origin        1.321e+00  4.165e-01   3.172  0.00177 **

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 3.34 on 190 degrees of freedom

Multiple R-squared:  0.8238,  Adjusted R-squared:  0.8192

F-statistic: 177.7 on 5 and 190 DF,  p-value: < 2.2e-16

 

> m41 <- lm(mpg~poly(horsepower,2)+poly(weight,2)+year+origin, data = Auto, subset = train1)

> summary(m41)

 

Call:

lm(formula = mpg ~ poly(horsepower, 2) + poly(weight, 2) + year +

    origin, data = Auto, subset = train1)

 

Residuals:

    Min      1Q  Median      3Q     Max

-8.3721 -1.8389 -0.1336  1.4726 11.0388

 

Coefficients:

                      Estimate Std. Error t value Pr(>|t|)   

(Intercept)          -38.57462    4.72545  -8.163 4.51e-14 ***

poly(horsepower, 2)1 -27.63378    9.03225  -3.059 0.002540 **

poly(horsepower, 2)2  13.19552    5.41544   2.437 0.015751 * 

poly(weight, 2)1     -76.66090   10.14388  -7.557 1.72e-12 ***

poly(weight, 2)2      21.36900    5.69954   3.749 0.000236 ***

year                   0.80114    0.06267  12.784  < 2e-16 ***

origin                 0.56215    0.34823   1.614 0.108124   

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 2.955 on 189 degrees of freedom

Multiple R-squared:  0.8517,  Adjusted R-squared:  0.847

F-statistic:   181 on 6 and 189 DF,  p-value: < 2.2e-16

 

> m42 <- lm(mpg~poly(horsepower,2)+poly(weight,2)+year+origin, data = Auto, subset = train2)

> summary(m42)

 

Call:

lm(formula = mpg ~ poly(horsepower, 2) + poly(weight, 2) + year +

    origin, data = Auto, subset = train2)

 

Residuals:

    Min      1Q  Median      3Q     Max

-6.4428 -1.6895 -0.0122  1.4802 12.2928

 

Coefficients:

                      Estimate Std. Error t value Pr(>|t|)   

(Intercept)          -35.19914    5.28107  -6.665 2.82e-10 ***

poly(horsepower, 2)1 -19.47293    9.01539  -2.160 0.032035 * 

poly(horsepower, 2)2  17.91697    5.17801   3.460 0.000667 ***

poly(weight, 2)1     -88.38118   10.19010  -8.673 1.90e-15 ***

poly(weight, 2)2      22.67798    5.33780   4.249 3.37e-05 ***

year                   0.76087    0.07059  10.778  < 2e-16 ***

origin                 0.50389    0.36643   1.375 0.170719   

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

 

Residual standard error: 2.943 on 189 degrees of freedom

Multiple R-squared:  0.8639,  Adjusted R-squared:  0.8596

F-statistic: 199.9 on 6 and 189 DF,  p-value: < 2.2e-16

 

>

> mean((mpg-predict(m31, Auto))[-train1]^2)

[1] 11.84881

> mean((mpg-predict(m32, Auto))[-train2]^2)

[1] 11.24764

> mean((mpg-predict(m41, Auto))[-train1]^2)

[1] 8.651271

> mean((mpg-predict(m42, Auto))[-train2]^2)

[1] 8.428948

>

> #validasi LOOCV

> library(boot)

 

> m3a <- glm(mpg~displacement+horsepower+weight+year+origin, data = Auto)

> m4a <- glm(mpg~poly(horsepower,2)+poly(weight,2)+year+origin, data = Auto)

> cv3 <- cv.glm(Auto, m3a)

> cv4 <- cv.glm(Auto, m4a)

> cv3$delta

[1] 11.30361 11.30313

> cv4$delta

[1] 8.681069 8.680636

>

> #validasi k-fold cross-validation

> cv.glm(Auto, m3a, K=5)$delta

[1] 11.24819 11.21356

> cv.glm(Auto, m4a, K=5)$delta

[1] 8.756626 8.710914

> cv.glm(Auto, m3a, K=10)$delta

[1] 11.20207 11.18776

> cv.glm(Auto, m4a, K=10)$delta

[1] 8.749589 8.728033


Tidak ada komentar:
Tulis komentar

Games