library(nnet) # Multinomial logistische Regression
library(car) # Modellvergleiche
library(MASS) # Modellvereinfachung
library(caret) # Kreuzvalidierung
# Datensatz 'data': pro Versuchsperson eine Zeile
library(MASS)
data <- housing
countsToCases <- function(x, countcol = "Freq") {
idx <- rep.int(seq_len(nrow(x)), x[[countcol]]); x[[countcol]] <- NULL; x[idx, ]}
data <- as.data.frame(countsToCases(data))
# Modell spezifizieren
model <- Sat~Infl+Type+Cont
variablen <- all.vars(model)
data2 <- na.omit(data[, variablen, drop=FALSE])
data2[,variablen[1]] <- as.factor(data2[,variablen[1]])
# Modellschätzung, Modellparameter
library(nnet); library(car)
variablen <- all.vars(model)
data2[,variablen[1]] <- as.factor(data2[,variablen[1]])
m <- multinom(model, data=data2)
Modellparameter <- summary(m, Wald=TRUE)
z <- Modellparameter$coefficients/Modellparameter$standard.errors
p <- (1 - pnorm(abs(z), 0, 1)) * 2
# Vergleich mit Nullmodell
model0 <- as.formula(paste(variablen[1], "~1", sep=""))
m0<-multinom(model0, data=data2)
m.vs.m0 <-anova(m, m0, test="Chisq")
# Wichtigkeit der Prädiktoren/Faktoren
Prädiktoren <- Anova(m)
# Pseudo R2
n <- nrow(na.omit(data2[,variablen]))
Cox.and.Snell <- as.numeric(1-exp((2/n)*(logLik(m0)-logLik(m))))
Negelkerke <- as.numeric((1-exp((2/n)*(logLik(m0)-logLik(m))))/(1-exp(logLik(m0)*(2/n))))
McFadden <- as.numeric(1-(logLik(m)/logLik(m0)))
R2 <- data.frame(Cox.and.Snell,Negelkerke,McFadden)
# Klassifikation
if (ncol(m$fitted.values)==1){
fitted <- cbind(1-m$fitted.values, m$fitted.values)
colnames(fitted) <- levels(data[,variablen[1]])
} else {
fitted <- m$fitted.values
}
pr <- apply(fitted, 1, function(x) names(which(x==max(x))))
pr <- factor(pr, levels=levels(data[,variablen[1]]))
klass.tab <- xtabs(~ data2[,variablen[1]] + pr)
# Nach Zufall zu erwartender Anteil richtiger Klassifikationen
Zufall <- as.numeric(rowSums(klass.tab) %*% rowSums(klass.tab)/sum( (klass.tab))^2)
# Anteil richtig klassifiziert
ant.ri.kl <- sum(diag(klass.tab))/sum(klass.tab) # insgesamt
Prozent.korrekt <- diag(klass.tab)/apply(klass.tab,1,sum)*100 # pro Zeile
klass.tab <-cbind(klass.tab, "korrekt in %"=Prozent.korrekt)
names(dimnames(klass.tab)) <- c("beobachtet", "vorhergesagt")
Ergebnis
# Output
list("Test für das Gesamtmodell"=m.vs.m0, Modellparameter=Modellparameter, "p-Werte der Modellparameter"=p, "Odds Ratios: exp(B)"=exp(Modellparameter$coefficients), "Tests pro Prädiktor/Faktor"=Prädiktoren, "Pseudo R2"=R2, Klassifikation=klass.tab, "Anteil richtig klassifizierter Fälle"=ant.ri.kl, "Nach Zufall zu erwartender Anteil richtig klassifizierter Fälle" = Zufall)
## $`Test für das Gesamtmodell`
## Likelihood ratio tests of Multinomial Models
##
## Response: Sat
## Model Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 1 3360 3648.878
## 2 Infl + Type + Cont 3348 3470.084 1 vs 2 12 178.7938 0
##
## $Modellparameter
## Call:
## multinom(formula = model, data = data2)
##
## Coefficients:
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium -0.4192316 0.4464003 0.6649367 -0.4356851 0.1313663 -0.6665728
## High -0.1387453 0.7348626 1.6126294 -0.7356261 -0.4079808 -1.4123333
## ContHigh
## Medium 0.3608513
## High 0.4818236
##
## Std. Errors:
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium 0.1729344 0.1415572 0.1863374 0.1725327 0.2231065 0.2062532
## High 0.1592295 0.1369380 0.1671316 0.1552714 0.2114965 0.2001496
## ContHigh
## Medium 0.1323975
## High 0.1241371
##
## Value/SE (Wald statistics):
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium -2.4242225 3.153497 3.568456 -2.525232 0.5888056 -3.231819
## High -0.8713538 5.366388 9.648858 -4.737680 -1.9290196 -7.056389
## ContHigh
## Medium 2.725515
## High 3.881384
##
## Residual Deviance: 3470.084
## AIC: 3498.084
##
## $`p-Werte der Modellparameter`
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium
## Medium 0.0153412 1.613271e-03 0.0003590916 1.156220e-02 0.55599171
## High 0.3835610 8.032883e-08 0.0000000000 2.161789e-06 0.05372843
## TypeTerrace ContHigh
## Medium 1.230051e-03 0.0064201217
## High 1.708855e-12 0.0001038636
##
## $`Odds Ratios: exp(B)`
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium 0.6575519 1.562677 1.944367 0.6468214 1.1403855 0.5134653
## High 0.8704497 2.085195 5.015983 0.4792053 0.6649916 0.2435743
## ContHigh
## Medium 1.434550
## High 1.619024
##
## $`Tests pro Prädiktor/Faktor`
## Analysis of Deviance Table (Type II tests)
##
## Response: Sat
## LR Chisq Df Pr(>Chisq)
## Infl 109.117 4 < 2.2e-16 ***
## Type 62.227 6 1.586e-11 ***
## Cont 16.060 2 0.0003256 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Pseudo R2`
## Cox.and.Snell Negelkerke McFadden
## 1 0.1009005 0.1138963 0.04899966
##
## $Klassifikation
## vorhergesagt
## beobachtet Low Medium High korrekt in %
## Low 337 20 210 59.435626
## Medium 197 23 226 5.156951
## High 184 20 464 69.461078
##
## $`Anteil richtig klassifizierter Fälle`
## [1] 0.4901844
##
## $`Nach Zufall zu erwartender Anteil richtig klassifizierter Fälle`
## [1] 0.3420774
library(MASS)
stepAIC(m, direction="both")
## Start: AIC=3498.08
## Sat ~ Infl + Type + Cont
##
## # weights: 18 (10 variable)
## initial value 1846.767257
## iter 10 value 1793.932058
## final value 1789.600661
## converged
## # weights: 15 (8 variable)
## initial value 1846.767257
## iter 10 value 1768.347114
## final value 1766.155394
## converged
## # weights: 21 (12 variable)
## initial value 1846.767257
## iter 10 value 1747.742032
## final value 1743.071799
## converged
## Df AIC
## <none> 3498.1
## - Cont 2 3510.1
## - Type 6 3548.3
## - Infl 4 3599.2
## Call:
## multinom(formula = Sat ~ Infl + Type + Cont, data = data2)
##
## Coefficients:
## (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium -0.4192316 0.4464003 0.6649367 -0.4356851 0.1313663 -0.6665728
## High -0.1387453 0.7348626 1.6126294 -0.7356261 -0.4079808 -1.4123333
## ContHigh
## Medium 0.3608513
## High 0.4818236
##
## Residual Deviance: 3470.084
## AIC: 3498.084
k <- 10 # k eingeben
library(caret)
k.folds <- function(k) {
accuracies.dt <- numeric()
folds <- createFolds(data2[,variablen[1]], k = k, list = TRUE, returnTrain = TRUE)
for (i in 1:k) {
model2 <- multinom(model, data = data2[folds[[i]],], method = "class")
predictions <- predict(object = model2, newdata = data2[-folds[[i]],], type = "class")
accuracies.dt <- c(accuracies.dt,
confusionMatrix(predictions, data2[-folds[[i]], variablen[1]])$overall[[1]])
}
accuracies.dt
}
k.fold <- k.folds(k)
Ergebnis
list("Anteil richtig klassifizierter Fälle pro fold"=k.fold, "Mittelwert"=mean(k.fold))
## $`Anteil richtig klassifizierter Fälle pro fold`
## [1] 0.5088757 0.4821429 0.4226190 0.4939759 0.4378698 0.4880952 0.4642857
## [8] 0.5178571 0.4702381 0.5502959
##
## $Mittelwert
## [1] 0.4836255