data <- read.csv("/Users/momo/Desktop/R\ /titanic.csv", header=T)
head(data)
## name gender age class fare group
## 1 ALLEN, Miss Elisabeth Walton 1 29 1 211
## 2 ALLISON, Mr Hudson Joshua Creighton 0 30 1 151
## 3 ALLISON, Mrs Bessie Waldo 1 25 1 151
## 4 ALLISON, Miss Helen Loraine 1 2 1 151
## 5 ALLISON, Master Hudson Trevor 0 1 1 151
## 6 ANDERSON, Mr Harry 0 47 1 26
## joined job boat survival
## 1 Southampton 2 1
## 2 Southampton Businessman 0
## 3 Southampton 0
## 4 Southampton 0
## 5 Southampton 11 1
## 6 Southampton Stockbroker 3 1
將乘客及員工的資料分開,以方便後續討論 並剔除資料中為9999的值
passenger <- data[data$class %in% c(1,2,3),]
crew <- data[!data$class %in% c(1,2,3),]
passenger <- passenger[!passenger$fare == 9999,]
passenger <- passenger[!passenger$age == 9999,]
crew <- crew[!crew$age == 9999,]
head(passenger)
## name gender age class fare group
## 1 ALLEN, Miss Elisabeth Walton 1 29 1 211
## 2 ALLISON, Mr Hudson Joshua Creighton 0 30 1 151
## 3 ALLISON, Mrs Bessie Waldo 1 25 1 151
## 4 ALLISON, Miss Helen Loraine 1 2 1 151
## 5 ALLISON, Master Hudson Trevor 0 1 1 151
## 6 ANDERSON, Mr Harry 0 47 1 26
## joined job boat survival
## 1 Southampton 2 1
## 2 Southampton Businessman 0
## 3 Southampton 0
## 4 Southampton 0
## 5 Southampton 11 1
## 6 Southampton Stockbroker 3 1
head(crew)
## name gender age class fare group joined
## 1318 ANDERSON, Mr James 0 40 Deck NA Southampton
## 1319 ARCHER, Mr Ernest Edward 0 36 Deck NA Southampton
## 1320 BAILEY, Mr Henry Joseph 0 43 Deck NA Southampton
## 1321 BOXHALL, Mr Joseph Groves 0 28 Deck NA Belfast
## 1322 BRADLEY, Mr T. 0 29 Deck NA Southampton
## 1323 BRICE, Mr Walter T 0 42 Deck NA Southampton
## job boat survival
## 1318 Able Seaman 3 1
## 1319 Able Seaman 16 1
## 1320 Master-at-arms 16 1
## 1321 4th. Officer 2 1
## 1322 Able Seaman 0
## 1323 Able Seaman 11 1
#乘客年齡分佈
summary(passenger$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 21.00 27.00 29.35 37.00 74.00
ggplot(data = passenger, aes(x = passenger$age)) +
geom_bar(colour = "black")
#船員年齡分佈
summary(crew$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 24.00 30.00 30.65 36.00 62.00
ggplot(data = crew, aes(x = crew$age)) +
geom_bar(colour = "black")
#乘客性別分佈
table(passenger$gender)
##
## 0 1
## 826 459
ggplot(data = passenger, aes(x = passenger$gender)) +
geom_bar(fill = "blue")
#船員性別分佈
table(crew$gender)
##
## 0 1
## 864 23
ggplot(data = crew, aes(x = crew$gender)) +
geom_bar(fill = "red")
#問題一 老小與青壯年存活率比較
因船員年齡較為集中,此處僅用乘客資料
young <- select(passenger, survival, age) %>%
filter(age < 18)
table(young)
## age
## survival 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 0 5 8 3 3 3 2 6 4 6 6 4 2 1 7 6 16 19
## 1 16 5 6 10 3 2 3 5 4 0 2 2 3 2 3 7 8
young_live <- sum(young$survival == 1)
young_dead <- sum(young$survival == 0)
young_all <- length(young$survival)
young_sur_rate <- young_live / young_all
print (young_sur_rate)
## [1] 0.4450549
normal <- select(passenger, survival, age) %>%
filter(age >= 18 & age < 65)
table(normal)
## age
## survival 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
## 0 28 29 35 35 39 28 31 28 24 26 37 24 31 18 17 15 13 16 18 10 12 11
## 1 16 13 10 16 22 12 24 15 15 15 11 17 11 16 15 7 11 12 18 5 7 15
## age
## survival 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
## 0 15 9 22 6 9 10 10 8 6 4 8 4 4 1 6 5 2 5 3 5 4 4
## 1 5 3 5 3 5 15 4 6 12 5 4 2 5 2 4 3 3 0 1 5 3 1
## age
## survival 62 63 64
## 0 2 6 3
## 1 3 1 2
normal_live <- sum(normal$survival == 1)
normal_dead <- sum(normal$survival == 0)
normal_all <- length(normal$survival)
normal_sur_rate <- normal_live / normal_all
print (normal_sur_rate)
## [1] 0.3712191
old <- select(passenger, survival, age) %>%
filter(age >= 65)
table(old)
## age
## survival 65 66 67 69 70 71 74
## 0 2 3 1 1 1 3 1
old_live <- sum(old$survival == 1)
old_dead <- sum(old$survival == 0)
old_all <- length(old$survival)
old_sur_rate <- old_live / old_all
print (old_sur_rate)
## [1] 0
Age <- data.frame(people = c("young", "normal", "old"),
sur_rate = c(young_sur_rate, normal_sur_rate, old_sur_rate))
ggplot(data = Age, aes(x = people, y = sur_rate, fill = people)) +
geom_bar(stat = "identity")
未成年的存活率最高 -> 推測逃難時大家會先讓小孩上救生艇
老年人的存活率為0 -> 推測老年人可能體力較為不濟或身體狀況較差或較看淡生死
Class1 <- select(passenger, survival, class) %>%
filter(class == 1)
Class1_live <- sum(Class1$survival == 1)
Class1_dead <- sum(Class1$survival == 0)
Class1_all <- length(Class1$survival)
Class1_sur_rate <- Class1_live / Class1_all
print (Class1_sur_rate)
## [1] 0.6453674
Class2 <- select(passenger, survival, class) %>%
filter(class == 2)
str(Class2)
## 'data.frame': 271 obs. of 2 variables:
## $ survival: int 0 1 0 0 0 0 1 0 0 0 ...
## $ class : Factor w/ 7 levels "1","2","3","A la Carte",..: 2 2 2 2 2 2 2 2 2 2 ...
Class2_live <- sum(Class2$survival == 1)
Class2_dead <- sum(Class2$survival == 0)
Class2_all <- length(Class2$survival)
Class2_sur_rate <- Class2_live / Class2_all
print (Class2_sur_rate)
## [1] 0.4132841
Class3 <- select(passenger, survival, class) %>%
filter(class == 3)
str(Class3)
## 'data.frame': 701 obs. of 2 variables:
## $ survival: int 0 1 0 0 1 1 1 1 0 0 ...
## $ class : Factor w/ 7 levels "1","2","3","A la Carte",..: 3 3 3 3 3 3 3 3 3 3 ...
Class3_live <- sum(Class3$survival == 1)
Class3_dead <- sum(Class3$survival == 0)
Class3_all <- length(Class3$survival)
Class3_sur_rate <- Class3_live / Class3_all
print (Class3_sur_rate)
## [1] 0.2453638
CLASS <- data.frame(Class = c("Class1", "Class2", "Class3"),
sur_rate = c(Class1_sur_rate, Class2_sur_rate, Class3_sur_rate))
ggplot(data = CLASS, aes(x = Class, y = sur_rate, fill = Class)) +
geom_bar(stat = "identity")
艙房等級越高,存活率越大 -> 推測居住於較高級艙房的旅客擁有較優良的逃生路線與救生艇
Name <- strsplit(as.character(passenger$name), ',', fixed = TRUE) %>%
unlist()
Name <- Name[seq(1,length(Name)-2,2)]
head(Name)
## [1] "ALLEN" "ALLISON" "ALLISON" "ALLISON" "ALLISON" "ANDERSON"
family <- names(table(Name)[table(Name) > 1])
passenger$lastname <- Name
head(passenger$lastname)
## [1] "ALLEN" "ALLISON" "ALLISON" "ALLISON" "ALLISON" "ANDERSON"
passenger$single <- rep(1, length(passenger$lastname))
passenger$single[passenger$lastname %in% family] = 0
p_table <- table(passenger$survival, passenger$single)
p_table
##
## 0 1
## 0 337 462
## 1 247 239
family_sur_rate <- p_table[2,1] / (p_table[1,1] + p_table[2,1])
single_sur_rate <- p_table[2,2] / (p_table[1,2] + p_table[2,2])
print (family_sur_rate)
## [1] 0.4229452
print (single_sur_rate)
## [1] 0.3409415
fam_or_sin <- data.frame(status = c("family", "single"),
sur_rate = c(family_sur_rate, single_sur_rate))
ggplot(data = fam_or_sin, aes(status, sur_rate, fill = status)) +
geom_bar(sta = "identity")
較家庭共同登船之乘客存活率較獨自登船之乘客稍高 -> 推測家庭成員會想辦法讓小孩能坐上救生艇,獨自登船的旅客也較易先幫助他人逃生
tmp <- data %>%
group_by(boat) %>%
filter(boat %in% c(1:16, "A", "B", "C", "D")) %>%
summarise(boat_cnt = n()) %>%
arrange(desc(boat_cnt))
tmp
## # A tibble: 20 x 2
## boat boat_cnt
## <fctr> <int>
## 1 13 50
## 2 15 43
## 3 C 42
## 4 14 38
## 5 4 38
## 6 11 37
## 7 5 33
## 8 10 31
## 9 3 31
## 10 16 30
## 11 9 30
## 12 8 27
## 13 7 26
## 14 6 23
## 15 D 22
## 16 12 19
## 17 2 17
## 18 B 15
## 19 A 13
## 20 1 12
ggplot(data = tmp, aes(boat, boat_cnt, fill = boat)) +
geom_bar(sta = "identity")
pie <- ggplot(tmp, aes(x=factor(1), y = boat_cnt, fill=boat))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start=0)
pie
cmp <- passenger %>%
filter(boat %in% c(1:16, "A", "B", "C", "D")) %>%
group_by(class, boat) %>%
summarise(boat_cnt = n()) #%>%
##arrange(desc(boat_cnt))
ggplot(cmp, aes(boat, fill = class)) +
geom_bar(position = "fill")
由圖表可看出Class1的旅客擁有較多搭乘救生艇的機會 -> 推論有錢人受到的待遇比較好
然而相較於Class2,Class3反而擁有較多搭乘救生艇的機會,和問題二的圖表相比較後,卻發現Class3的存活率比Class2低得多 -> 推論是由於Class3的人數較Class2的人數多很多,以至於在計算的時候比例會較高