Erforderliche Pakete laden

library(nnet)   # Multinomial logistische Regression
library(car)    # Modellvergleiche
library(MASS)   # Modellvereinfachung
library(caret)  # Kreuzvalidierung

Datensatz einlesen und Modell spezifizieren

# Datensatz 'data': pro Versuchsperson eine Zeile
library(MASS)
data <- housing
countsToCases <- function(x, countcol = "Freq") {
idx <- rep.int(seq_len(nrow(x)), x[[countcol]]); x[[countcol]] <- NULL; x[idx, ]}
data <- as.data.frame(countsToCases(data))

# Modell spezifizieren
model <- Sat~Infl+Type+Cont

Multinomiale logistische Regression

variablen <- all.vars(model)
data2 <- na.omit(data[, variablen, drop=FALSE])
data2[,variablen[1]] <- as.factor(data2[,variablen[1]])

# Modellschätzung, Modellparameter
library(nnet); library(car)
variablen <- all.vars(model)
data2[,variablen[1]] <- as.factor(data2[,variablen[1]])
m <- multinom(model, data=data2)
Modellparameter <- summary(m, Wald=TRUE)
z <- Modellparameter$coefficients/Modellparameter$standard.errors
p <- (1 - pnorm(abs(z), 0, 1)) * 2

# Vergleich mit Nullmodell
model0 <- as.formula(paste(variablen[1], "~1", sep=""))
m0<-multinom(model0, data=data2)
m.vs.m0 <-anova(m, m0, test="Chisq")

# Wichtigkeit der Prädiktoren/Faktoren
Prädiktoren <- Anova(m)

# Pseudo R2
n <- nrow(na.omit(data2[,variablen]))
Cox.and.Snell <- as.numeric(1-exp((2/n)*(logLik(m0)-logLik(m))))
Negelkerke <- as.numeric((1-exp((2/n)*(logLik(m0)-logLik(m))))/(1-exp(logLik(m0)*(2/n))))
McFadden <- as.numeric(1-(logLik(m)/logLik(m0)))
R2 <- data.frame(Cox.and.Snell,Negelkerke,McFadden)

# Klassifikation
if (ncol(m$fitted.values)==1){
  fitted <- cbind(1-m$fitted.values, m$fitted.values)
  colnames(fitted) <- levels(data[,variablen[1]])
} else {
  fitted <- m$fitted.values
}
pr <- apply(fitted, 1, function(x) names(which(x==max(x))))
pr <- factor(pr, levels=levels(data[,variablen[1]]))
klass.tab <- xtabs(~ data2[,variablen[1]] + pr)
# Nach Zufall zu erwartender Anteil richtiger Klassifikationen
Zufall <- as.numeric(rowSums(klass.tab) %*% rowSums(klass.tab)/sum( (klass.tab))^2)
# Anteil richtig klassifiziert
ant.ri.kl <- sum(diag(klass.tab))/sum(klass.tab) # insgesamt
Prozent.korrekt <- diag(klass.tab)/apply(klass.tab,1,sum)*100 # pro Zeile
klass.tab  <-cbind(klass.tab, "korrekt in %"=Prozent.korrekt)
names(dimnames(klass.tab)) <- c("beobachtet", "vorhergesagt")

Ergebnis

# Output
list("Test für das Gesamtmodell"=m.vs.m0, Modellparameter=Modellparameter, "p-Werte der Modellparameter"=p, "Odds Ratios: exp(B)"=exp(Modellparameter$coefficients), "Tests pro Prädiktor/Faktor"=Prädiktoren, "Pseudo R2"=R2, Klassifikation=klass.tab, "Anteil richtig klassifizierter Fälle"=ant.ri.kl, "Nach Zufall zu erwartender Anteil richtig klassifizierter Fälle" = Zufall)
## $`Test für das Gesamtmodell`
## Likelihood ratio tests of Multinomial Models
## 
## Response: Sat
##                Model Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1                  1      3360   3648.878                              
## 2 Infl + Type + Cont      3348   3470.084 1 vs 2    12 178.7938       0
## 
## $Modellparameter
## Call:
## multinom(formula = model, data = data2)
## 
## Coefficients:
##        (Intercept) InflMedium  InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium  -0.4192316  0.4464003 0.6649367    -0.4356851  0.1313663  -0.6665728
## High    -0.1387453  0.7348626 1.6126294    -0.7356261 -0.4079808  -1.4123333
##         ContHigh
## Medium 0.3608513
## High   0.4818236
## 
## Std. Errors:
##        (Intercept) InflMedium  InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium   0.1729344  0.1415572 0.1863374     0.1725327  0.2231065   0.2062532
## High     0.1592295  0.1369380 0.1671316     0.1552714  0.2114965   0.2001496
##         ContHigh
## Medium 0.1323975
## High   0.1241371
## 
## Value/SE (Wald statistics):
##        (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium  -2.4242225   3.153497 3.568456     -2.525232  0.5888056   -3.231819
## High    -0.8713538   5.366388 9.648858     -4.737680 -1.9290196   -7.056389
##        ContHigh
## Medium 2.725515
## High   3.881384
## 
## Residual Deviance: 3470.084 
## AIC: 3498.084 
## 
## $`p-Werte der Modellparameter`
##        (Intercept)   InflMedium     InflHigh TypeApartment TypeAtrium
## Medium   0.0153412 1.613271e-03 0.0003590916  1.156220e-02 0.55599171
## High     0.3835610 8.032883e-08 0.0000000000  2.161789e-06 0.05372843
##         TypeTerrace     ContHigh
## Medium 1.230051e-03 0.0064201217
## High   1.708855e-12 0.0001038636
## 
## $`Odds Ratios: exp(B)`
##        (Intercept) InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium   0.6575519   1.562677 1.944367     0.6468214  1.1403855   0.5134653
## High     0.8704497   2.085195 5.015983     0.4792053  0.6649916   0.2435743
##        ContHigh
## Medium 1.434550
## High   1.619024
## 
## $`Tests pro Prädiktor/Faktor`
## Analysis of Deviance Table (Type II tests)
## 
## Response: Sat
##      LR Chisq Df Pr(>Chisq)    
## Infl  109.117  4  < 2.2e-16 ***
## Type   62.227  6  1.586e-11 ***
## Cont   16.060  2  0.0003256 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $`Pseudo R2`
##   Cox.and.Snell Negelkerke   McFadden
## 1     0.1009005  0.1138963 0.04899966
## 
## $Klassifikation
##           vorhergesagt
## beobachtet Low Medium High korrekt in %
##     Low    337     20  210    59.435626
##     Medium 197     23  226     5.156951
##     High   184     20  464    69.461078
## 
## $`Anteil richtig klassifizierter Fälle`
## [1] 0.4901844
## 
## $`Nach Zufall zu erwartender Anteil richtig klassifizierter Fälle`
## [1] 0.3420774

Modellvereinfachung

library(MASS)
stepAIC(m, direction="both")
## Start:  AIC=3498.08
## Sat ~ Infl + Type + Cont
## 
## # weights:  18 (10 variable)
## initial  value 1846.767257 
## iter  10 value 1793.932058
## final  value 1789.600661 
## converged
## # weights:  15 (8 variable)
## initial  value 1846.767257 
## iter  10 value 1768.347114
## final  value 1766.155394 
## converged
## # weights:  21 (12 variable)
## initial  value 1846.767257 
## iter  10 value 1747.742032
## final  value 1743.071799 
## converged
##        Df    AIC
## <none>    3498.1
## - Cont  2 3510.1
## - Type  6 3548.3
## - Infl  4 3599.2
## Call:
## multinom(formula = Sat ~ Infl + Type + Cont, data = data2)
## 
## Coefficients:
##        (Intercept) InflMedium  InflHigh TypeApartment TypeAtrium TypeTerrace
## Medium  -0.4192316  0.4464003 0.6649367    -0.4356851  0.1313663  -0.6665728
## High    -0.1387453  0.7348626 1.6126294    -0.7356261 -0.4079808  -1.4123333
##         ContHigh
## Medium 0.3608513
## High   0.4818236
## 
## Residual Deviance: 3470.084 
## AIC: 3498.084

k-fold Cross Validation

k <- 10     # k eingeben
library(caret)
k.folds <- function(k) {
      accuracies.dt <- numeric()
      folds <- createFolds(data2[,variablen[1]], k = k, list = TRUE, returnTrain = TRUE)
      for (i in 1:k) {
        model2 <- multinom(model, data = data2[folds[[i]],], method = "class")
        predictions <- predict(object = model2, newdata = data2[-folds[[i]],], type = "class")
        accuracies.dt <- c(accuracies.dt, 
        confusionMatrix(predictions, data2[-folds[[i]], variablen[1]])$overall[[1]])
      }
      accuracies.dt
}
k.fold <- k.folds(k)

Ergebnis

list("Anteil richtig klassifizierter Fälle pro fold"=k.fold, "Mittelwert"=mean(k.fold))
## $`Anteil richtig klassifizierter Fälle pro fold`
##  [1] 0.5088757 0.4821429 0.4226190 0.4939759 0.4378698 0.4880952 0.4642857
##  [8] 0.5178571 0.4702381 0.5502959
## 
## $Mittelwert
## [1] 0.4836255