# R 다중회귀분석 예제 (Multiple Regression Example in R)


# Multiple Regression Model example
#
# Data: 20 programmers. Test whether salary is associated with
# experience (years of experience) and score (job-aptitude test score).
# NOTE(review): reconstructed from a scraped page that had code and console
# output fused on single lines; original output is kept below as comments.

library(psych)  # pairs.panels()
library(leaps)  # regsubsets()

df <- read.csv("data/salary.csv")
head(df)
summary(df)
# experience: 0-10, score: 70-100, salary: 22.2-43.0 (mean 31.23)

# Scatterplot matrix with correlations:
# cor(salary, experience) = 0.86
pairs.panels(df)

# Simple regression: salary increases with experience.
model <- lm(salary ~ experience, data = df)
summary(model)
# (Intercept) 22.8111, experience 1.6200 (p = 1.54e-06)
# Multiple R-squared: 0.7316, Adjusted R-squared: 0.7167

# Multiple regression: also captures the salary increase that comes with
# higher aptitude scores. cor(experience, score) = 0.34.
# Equivalent: model <- lm(salary ~ ., data = df)
model <- lm(salary ~ experience + score, data = df)
summary(model)
# (Intercept) 3.17394, experience 1.40390 ***, score 0.25089 **
# Multiple R-squared: 0.8342, Adjusted R-squared: 0.8147

# Fitted regression equation:
#   salary = 3.174 + 1.404 * experience + 0.251 * score
# b1: holding score fixed, one more year of experience is expected to
#     raise salary by $1,404.
# b2: holding experience fixed, one more score point is expected to
#     raise salary by $251.

# Interpreting a multiple regression fit:
# (1) Adjusted R-squared
#     R-squared 0.83 --> experience and score explain 83% of the variation
#     in salary. But R-squared always grows as predictors are added, so use
#     Adjusted R-squared, which penalizes the number of predictors.
# (2) F-test: H0: b1 = b2 = ... = bk = 0
#     Tests whether the response has a significant relationship with the
#     set of all predictors (b0 itself is of little interest).
# (3) t-test: H0: bi = 0 — significance of each individual predictor.
# (4) Residual analysis: Residuals plot / Normal Q-Q plot / Leverage plot.

# Influential observations
plot(model)  # Leverage plot flags observation 2 as an outlier & influential

dcolor <- rep(1, length(df$salary))
dcolor[2] <- 2
pairs(df, col = dcolor, pch = dcolor)  # highlight observation 2

# Whether to drop an influential point is ultimately a judgment call.
df2 <- df[-2, ]    # refit with the influential point removed
pairs.panels(df2)  # cor(salary, experience) rises to 0.91; others drop

model2 <- lm(salary ~ experience + score, data = df2)
summary(model2)  # score coefficient is no longer significant (p = 0.0909)
# Multiple R-squared: 0.8592, Adjusted R-squared: 0.8416

# Estimation and prediction
# Predict salary for (5 yrs, score 80) and (10 yrs, score 70).
newcases <- data.frame(experience = c(5, 10), score = c(80, 70))

# 95% confidence interval for the mean salary
predict(model, newcases, interval = "confidence")
# fit 30.26 [29.05, 31.48] ; fit 34.77 [31.24, 38.31]

# 95% prediction interval for a single new individual
predict(model, newcases, interval = "prediction")
# fit 30.26 [25.02, 35.51] ; fit 34.77 [28.57, 40.98]

# Multicollinearity
# When predictors are highly correlated with each other, coefficients are
# hard to estimate precisely --> use model selection to pick variables.
# attitude: survey of 35 employees in each of 30 departments; values are
# the percentage of favorable responses per question.
attitude
round(cor(attitude), 3)
pairs.panels(attitude)
# cor(complaints, learning) = 0.597 ; cor(complaints, raises) = 0.669

plot(attitude[, c("rating", "complaints", "learning")])
a <- lm(rating ~ complaints + learning, data = attitude)
summary(a)
# complaints 0.6435 *** ; learning 0.2112 (p = 0.128, not significant)
# learning is NOT unrelated to rating: it is also correlated with
# complaints, which makes its individual contribution look small.

# Model (variable) selection methods
# *** Variables mandated by domain knowledge stay in the model! ***
# (1) Forward selection: add the most significant variable one at a time
#     (by R-squared); unstable under small data perturbations.
# (2) Backward selection: start with all variables, drop the least useful
#     one at a time; uses all information but hard with many variables.
# (3) Stepwise selection: forward + backward combined; a variable added
#     earlier may be removed if it later loses importance.
# (4) All-subsets regression: compare every possible model and pick the
#     one with the lowest AIC/BIC/Mallow's Cp or highest adjusted R-sq.
#     AIC (Akaike information criterion) / BIC (Bayesian ...) penalize
#     model complexity.

# Backward selection (manual, via coefficient t-test p-values)
out <- lm(rating ~ ., data = attitude)
summary(out)
anova(out)  # 'critical' contributes least --> remove it

out2 <- lm(rating ~ . - critical, data = attitude)
summary(out2)
anova(out2)  # next, remove 'raises'

# Backward selection, automated
backward <- step(out, direction = "backward", trace = TRUE)
backward <- step(out, direction = "backward", trace = FALSE)
backward        # final model: rating ~ complaints + learning
backward$anova  # removed in order: critical, raises, privileges, advance

# Stepwise selection (same final model here)
both <- step(out, direction = "both", trace = FALSE)
both
both$anova

# All-subsets regression: keep the 5 best models of each size
leap <- regsubsets(rating ~ ., attitude, nbest = 5)
summary(leap)
plot(leap)                   # default scale: BIC
plot(leap, scale = "adjr2")  # adjusted R-squared scale

# Practice 5: hotel margin prediction
data <- read.csv("data/laquinta.csv")
summary(data)
str(data)  # 100 obs. of 7 numeric variables

# Check pairwise correlations: predictors are weakly correlated with each
# other and with the response.
round(cor(data), 3)
pairs.panels(data)

# Full regression model
model <- lm(Margin ~ ., data = data)
summary(model)
# F-test significant; R-squared 0.525; all coefficients significant
# except Enrollment and Distance.
plot(model)  # residual plots look fine

backward <- step(model, direction = "backward", trace = FALSE)
backward
both <- step(model, direction = "both", trace = FALSE)
both
# Final model: Margin ~ Number + Nearest + Office.Space + Enrollment + Income
# Coefficients:
#   (Intercept)  Number    Nearest  Office.Space Enrollment  Income
#   37.128891   -0.007742  1.586923 0.019576     0.196385    0.421411

# Predict Margin for one new site (95% prediction interval)
new <- data.frame(Number = 3815, Nearest = 0.9, Office.Space = 476,
                  Enrollment = 24.5, Income = 35, Distance = 11.2)
new
predict(model, new, interval = "prediction")
# fit 37.09 [25.40, 48.79]

# Find the predictor subset minimizing BIC
regsub <- regsubsets(Margin ~ ., data, nbest = 5)
plot(regsub)  # BIC-best: Margin ~ Number + Nearest + Office.Space + Income
plot(regsub, scale = "adjr2")

# --- scraped blog page navigation residue (not part of the script) ---
# Toplist
# 최신 우편물 (recent posts)
# 태그 (tags)