讀取資料

data <- read.csv("/Users/momo/Desktop/R\ /titanic.csv", header=T)
head(data)
##                                  name gender age class fare group
## 1        ALLEN, Miss Elisabeth Walton      1  29     1  211      
## 2 ALLISON, Mr Hudson Joshua Creighton      0  30     1  151      
## 3           ALLISON, Mrs Bessie Waldo      1  25     1  151      
## 4         ALLISON, Miss Helen Loraine      1   2     1  151      
## 5       ALLISON, Master Hudson Trevor      0   1     1  151      
## 6                  ANDERSON, Mr Harry      0  47     1   26      
##        joined         job boat survival
## 1 Southampton                2        1
## 2 Southampton Businessman             0
## 3 Southampton                         0
## 4 Southampton                         0
## 5 Southampton               11        1
## 6 Southampton Stockbroker    3        1

整理資料

將乘客及員工的資料分開,以方便後續討論 並剔除資料中為9999的值

passenger <- data[data$class %in% c(1,2,3),]
crew <- data[!data$class %in% c(1,2,3),]

passenger <- passenger[!passenger$fare == 9999,]
passenger <- passenger[!passenger$age == 9999,]
crew <- crew[!crew$age == 9999,]

head(passenger)
##                                  name gender age class fare group
## 1        ALLEN, Miss Elisabeth Walton      1  29     1  211      
## 2 ALLISON, Mr Hudson Joshua Creighton      0  30     1  151      
## 3           ALLISON, Mrs Bessie Waldo      1  25     1  151      
## 4         ALLISON, Miss Helen Loraine      1   2     1  151      
## 5       ALLISON, Master Hudson Trevor      0   1     1  151      
## 6                  ANDERSON, Mr Harry      0  47     1   26      
##        joined         job boat survival
## 1 Southampton                2        1
## 2 Southampton Businessman             0
## 3 Southampton                         0
## 4 Southampton                         0
## 5 Southampton               11        1
## 6 Southampton Stockbroker    3        1
head(crew)
##                           name gender age class fare group      joined
## 1318        ANDERSON, Mr James      0  40  Deck   NA       Southampton
## 1319  ARCHER, Mr Ernest Edward      0  36  Deck   NA       Southampton
## 1320   BAILEY, Mr Henry Joseph      0  43  Deck   NA       Southampton
## 1321 BOXHALL, Mr Joseph Groves      0  28  Deck   NA           Belfast
## 1322            BRADLEY, Mr T.      0  29  Deck   NA       Southampton
## 1323        BRICE, Mr Walter T      0  42  Deck   NA       Southampton
##                 job boat survival
## 1318    Able Seaman    3        1
## 1319    Able Seaman   16        1
## 1320 Master-at-arms   16        1
## 1321   4th. Officer    2        1
## 1322    Able Seaman             0
## 1323    Able Seaman   11        1

船員及乘客的基本統計

#乘客年齡分佈
summary(passenger$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   21.00   27.00   29.35   37.00   74.00
ggplot(data = passenger, aes(x = passenger$age)) + 
  geom_bar(colour = "black")

#船員年齡分佈
summary(crew$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   24.00   30.00   30.65   36.00   62.00
ggplot(data = crew, aes(x = crew$age)) + 
  geom_bar(colour = "black")

#乘客性別分佈
table(passenger$gender)
## 
##   0   1 
## 826 459
ggplot(data = passenger, aes(x = passenger$gender)) + 
  geom_bar(fill = "blue")

#船員性別分佈
table(crew$gender) 
## 
##   0   1 
## 864  23
ggplot(data = crew, aes(x = crew$gender)) + 
  geom_bar(fill = "red")

#問題一 老小與青壯年存活率比較

因船員年齡較為集中,此處僅用乘客資料

young <- select(passenger, survival, age) %>%
  filter(age < 18)

table(young)
##         age
## survival  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
##        0  5  8  3  3  3  2  6  4  6  6  4  2  1  7  6 16 19
##        1 16  5  6 10  3  2  3  5  4  0  2  2  3  2  3  7  8
young_live <- sum(young$survival == 1)
young_dead <- sum(young$survival == 0)
young_all <- length(young$survival)
young_sur_rate <- young_live / young_all
print (young_sur_rate)
## [1] 0.4450549
normal <- select(passenger, survival, age) %>%
  filter(age >= 18 & age < 65)

table(normal)
##         age
## survival 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
##        0 28 29 35 35 39 28 31 28 24 26 37 24 31 18 17 15 13 16 18 10 12 11
##        1 16 13 10 16 22 12 24 15 15 15 11 17 11 16 15  7 11 12 18  5  7 15
##         age
## survival 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
##        0 15  9 22  6  9 10 10  8  6  4  8  4  4  1  6  5  2  5  3  5  4  4
##        1  5  3  5  3  5 15  4  6 12  5  4  2  5  2  4  3  3  0  1  5  3  1
##         age
## survival 62 63 64
##        0  2  6  3
##        1  3  1  2
normal_live <- sum(normal$survival == 1) 
normal_dead <- sum(normal$survival == 0) 
normal_all <- length(normal$survival) 
normal_sur_rate <- normal_live / normal_all
print (normal_sur_rate)
## [1] 0.3712191
old <- select(passenger, survival, age) %>%
  filter(age >= 65)

table(old)
##         age
## survival 65 66 67 69 70 71 74
##        0  2  3  1  1  1  3  1
old_live <- sum(old$survival == 1) 
old_dead <- sum(old$survival == 0) 
old_all <- length(old$survival) 
old_sur_rate <- old_live / old_all
print (old_sur_rate)
## [1] 0
Age <- data.frame(people = c("young", "normal", "old"),
            sur_rate = c(young_sur_rate, normal_sur_rate, old_sur_rate))
ggplot(data = Age, aes(x = people, y = sur_rate, fill = people)) + 
  geom_bar(stat = "identity")

未成年的存活率最高 -> 推測逃難時大家會先讓小孩上救生艇
老年人的存活率為0 -> 推測老年人可能體力較為不濟或身體狀況較差或較看淡生死

問題二 乘客艙房等級與存活率的關係

Class1 <- select(passenger, survival, class) %>%
  filter(class == 1)

Class1_live <- sum(Class1$survival == 1) 
Class1_dead <- sum(Class1$survival == 0) 
Class1_all <- length(Class1$survival) 
Class1_sur_rate <- Class1_live / Class1_all
print (Class1_sur_rate)
## [1] 0.6453674
Class2 <- select(passenger, survival, class) %>%
  filter(class == 2)
str(Class2)
## 'data.frame':    271 obs. of  2 variables:
##  $ survival: int  0 1 0 0 0 0 1 0 0 0 ...
##  $ class   : Factor w/ 7 levels "1","2","3","A la Carte",..: 2 2 2 2 2 2 2 2 2 2 ...
Class2_live <- sum(Class2$survival == 1) 
Class2_dead <- sum(Class2$survival == 0) 
Class2_all <- length(Class2$survival) 
Class2_sur_rate <- Class2_live / Class2_all
print (Class2_sur_rate)
## [1] 0.4132841
Class3 <- select(passenger, survival, class) %>%
  filter(class == 3)
str(Class3)
## 'data.frame':    701 obs. of  2 variables:
##  $ survival: int  0 1 0 0 1 1 1 1 0 0 ...
##  $ class   : Factor w/ 7 levels "1","2","3","A la Carte",..: 3 3 3 3 3 3 3 3 3 3 ...
Class3_live <- sum(Class3$survival == 1) 
Class3_dead <- sum(Class3$survival == 0) 
Class3_all <- length(Class3$survival) 
Class3_sur_rate <- Class3_live / Class3_all
print (Class3_sur_rate)
## [1] 0.2453638
CLASS <- data.frame(Class = c("Class1", "Class2", "Class3"),
                  sur_rate = c(Class1_sur_rate, Class2_sur_rate, Class3_sur_rate))
ggplot(data = CLASS, aes(x = Class, y = sur_rate, fill = Class)) + 
  geom_bar(stat = "identity")

艙房等級越高,存活率越大 -> 推測居住於較高級艙房的旅客擁有較優良的逃生路線與救生艇

問題三 是否獨自登船與存活率的關係

Name <- strsplit(as.character(passenger$name), ',', fixed = TRUE) %>%
  unlist() 
Name <- Name[seq(1,length(Name)-2,2)] 
head(Name)
## [1] "ALLEN"    "ALLISON"  "ALLISON"  "ALLISON"  "ALLISON"  "ANDERSON"
family <- names(table(Name)[table(Name) > 1])
passenger$lastname <- Name
head(passenger$lastname)
## [1] "ALLEN"    "ALLISON"  "ALLISON"  "ALLISON"  "ALLISON"  "ANDERSON"
passenger$single <- rep(1, length(passenger$lastname)) 
passenger$single[passenger$lastname %in% family] = 0

p_table <- table(passenger$survival, passenger$single)
p_table
##    
##       0   1
##   0 337 462
##   1 247 239
family_sur_rate <- p_table[2,1] / (p_table[1,1] + p_table[2,1])
single_sur_rate <- p_table[2,2] / (p_table[1,2] + p_table[2,2])
print (family_sur_rate)
## [1] 0.4229452
print (single_sur_rate)
## [1] 0.3409415
fam_or_sin <- data.frame(status = c("family", "single"), 
                         sur_rate = c(family_sur_rate, single_sur_rate))

ggplot(data = fam_or_sin, aes(status, sur_rate, fill = status)) + 
  geom_bar(sta = "identity")

較家庭共同登船之乘客存活率較獨自登船之乘客稍高 -> 推測家庭成員會想辦法讓小孩能坐上救生艇,獨自登船的旅客也較易先幫助他人逃生

問題四 救生艇承載人數與登船旅客之艙級比較

tmp <- data %>%
  group_by(boat) %>%
  filter(boat %in% c(1:16, "A", "B", "C", "D")) %>%
  summarise(boat_cnt = n()) %>%
  arrange(desc(boat_cnt))
tmp
## # A tibble: 20 x 2
##      boat boat_cnt
##    <fctr>    <int>
##  1     13       50
##  2     15       43
##  3      C       42
##  4     14       38
##  5      4       38
##  6     11       37
##  7      5       33
##  8     10       31
##  9      3       31
## 10     16       30
## 11      9       30
## 12      8       27
## 13      7       26
## 14      6       23
## 15      D       22
## 16     12       19
## 17      2       17
## 18      B       15
## 19      A       13
## 20      1       12
ggplot(data = tmp, aes(boat, boat_cnt, fill = boat)) + 
  geom_bar(sta = "identity")

pie <- ggplot(tmp, aes(x=factor(1), y = boat_cnt, fill=boat))+
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start=0)
pie

cmp <- passenger %>%
  filter(boat %in% c(1:16, "A", "B", "C", "D")) %>%
  group_by(class, boat) %>%
  summarise(boat_cnt = n()) #%>%
  ##arrange(desc(boat_cnt))

ggplot(cmp, aes(boat, fill = class)) +
  geom_bar(position = "fill")

由圖表可看出Class1的旅客擁有較多搭乘救生艇的機會 -> 推論有錢人受到的待遇比較好
然而相較於Class2,Class3反而擁有較多搭乘救生艇的機會,和問題二的圖表相比較後,卻發現Class3的存活率比Class2低得多 -> 推論是由於Class3的人數較Class2的人數多很多,以至於在計算的時候比例會較高