library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the Titanic records that the analyses in this file read from `data`.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
# NOTE(review): the absolute Windows path only resolves on the original
# author's machine -- adjust before running elsewhere.
data <- read.csv("C:/Users/User/Desktop/R HW/titanic.csv", header = TRUE, sep = ",")
分析票價和生存率的關係,以100元為分界點
# Death rate among passengers whose fare is at least 100.
# NOTE(review): a later section of this file drops fare == 9999 as an invalid
# value; such rows also satisfy fare >= 100 here -- confirm whether they
# should be excluded from this group.
temp1 <- data %>%
  select(fare, survival) %>%
  filter(fare >= 100)
# Rows of the group that did not survive. A logical filter replaces negative
# indexing with which(): when nobody in the group survived, which() returns
# integer(0) and temp1[-integer(0), ] selects zero rows instead of all rows.
asd <- filter(temp1, survival == 0)
# Share of the group that died: number of deaths / group size.
qwe <- nrow(asd) / nrow(temp1)
print(qwe)
## [1] 0.4234234
# Death rate among passengers in the lower fare group (fare <= 100).
# NOTE(review): the original heading says "fare below 100", but the filter is
# fare <= 100, so rows with fare == 100 are counted both here and in the
# fare >= 100 group -- confirm the intended boundary.
temp2 <- data %>%
  select(fare, survival) %>%
  filter(fare <= 100)
# Rows of the group that did not survive. A logical filter replaces negative
# indexing with which(): when nobody in the group survived, which() returns
# integer(0) and temp2[-integer(0), ] selects zero rows instead of all rows.
zxc <- filter(temp2, survival == 0)
# Share of the group that died: number of deaths / group size.
fgh <- nrow(zxc) / nrow(temp2)
print(fgh)
## [1] 0.647351
票價在100元以上者的死亡率為0.423,而票價在100元以下者的死亡率為0.647。由此可以推測,在事故發生時,票價較高的乘客可能享有優先逃生的權利。
分析傭人的搭船地點,生存率,和class等級
# Surviving (survival == 1) class 1 rows whose job is "Personal Maid", split
# by the gender code: temp3 keeps gender == 1, temp4 keeps gender == 0
# (temp4 additionally carries the age column).
temp3 <- data %>%
  select(survival, gender, joined, class, job, fare) %>%
  filter(survival == 1, class == 1, job == "Personal Maid", gender == 1)
temp4 <- data %>%
  select(survival, gender, joined, class, job, age, fare) %>%
  filter(survival == 1, class == 1, job == "Personal Maid", gender == 0)
# Show the gender == 1 rows grouped by the `joined` column.
temp3 %>% group_by(joined)
## # A tibble: 18 x 6
## # Groups: joined [2]
## survival gender joined class job fare
## <int> <int> <fctr> <fctr> <fctr> <int>
## 1 1 1 Southampton 1 Personal Maid 78
## 2 1 1 Cherbourg 1 Personal Maid 76
## 3 1 1 Cherbourg 1 Personal Maid 63
## 4 1 1 Cherbourg 1 Personal Maid 247
## 5 1 1 Southampton 1 Personal Maid 221
## 6 1 1 Cherbourg 1 Personal Maid 262
## 7 1 1 Southampton 1 Personal Maid 151
## 8 1 1 Cherbourg 1 Personal Maid 110
## 9 1 1 Southampton 1 Personal Maid 211
## 10 1 1 Southampton 1 Personal Maid 80
## 11 1 1 Southampton 1 Personal Maid 211
## 12 1 1 Cherbourg 1 Personal Maid 106
## 13 1 1 Cherbourg 1 Personal Maid 146
## 14 1 1 Southampton 1 Personal Maid 86
## 15 1 1 Southampton 1 Personal Maid 93
## 16 1 1 Southampton 1 Personal Maid 31
## 17 1 1 Cherbourg 1 Personal Maid 512
## 18 1 1 Cherbourg 1 Personal Maid 134
從分析結果可以發現,傭人多在Southampton和Cherbourg登船,可推知這2個地區較為富裕。有趣的一點是,傭人的生存率居然高達100%(須注意:上述篩選條件已限定survival==1,表中只列出生還者,因此這個數字仍需與全部傭人的人數對照才能確認),其生存率高的原因大概是跟著雇主,所以同樣享有優先逃生的權利。
class等級最高和最低存活率的比較
# Restrict the data to class 1 and class 3 rows, then drop rows whose fare or
# age equals 9999 (presumably a placeholder for a missing value -- confirm).
# The filters are applied one after another on purpose.
passenger <- data[data$class %in% c(1, 3), ]
passenger <- passenger[passenger$fare != 9999, ]
passenger <- passenger[passenger$age != 9999, ]
# Preview the first rows of the cleaned subset.
head(passenger)
## name gender age class fare group
## 1 ALLEN, Miss Elisabeth Walton 1 29 1 211
## 2 ALLISON, Mr Hudson Joshua Creighton 0 30 1 151
## 3 ALLISON, Mrs Bessie Waldo 1 25 1 151
## 4 ALLISON, Miss Helen Loraine 1 2 1 151
## 5 ALLISON, Master Hudson Trevor 0 1 1 151
## 6 ANDERSON, Mr Harry 0 47 1 26
## joined job boat survival
## 1 Southampton 2 1
## 2 Southampton Businessman 0
## 3 Southampton 0
## 4 Southampton 0
## 5 Southampton 11 1
## 6 Southampton Stockbroker 3 1
# Class 1 survival rate: survivors divided by all class 1 rows in `passenger`.
Class1 <- passenger %>%
  select(survival, class) %>%
  filter(class == 1)
Class1total <- nrow(Class1)
Class1alive <- sum(Class1$survival == 1)
Class1dead <- sum(Class1$survival == 0)
Class1survivalrate <- Class1alive / Class1total
print(Class1survivalrate)
## [1] 0.6453674
# Class 3 survival rate: survivors divided by all class 3 rows in `passenger`.
Class3 <- passenger %>%
  select(survival, class) %>%
  filter(class == 3)
Class3total <- nrow(Class3)
Class3alive <- sum(Class3$survival == 1)
Class3dead <- sum(Class3$survival == 0)
Class3survivalrate <- Class3alive / Class3total
print(Class3survivalrate)
## [1] 0.2453638
# Compare the two survival rates: one row per class, drawn as a bar chart
# with one bar (and one fill colour) per class.
CLASS <- data.frame(
  Class = c("Class1", "Class3"),
  survivalrate = c(Class1survivalrate, Class3survivalrate)
)
ggplot(CLASS, aes(x = Class, y = survivalrate, fill = Class)) +
  geom_bar(stat = "identity")
由結果推論:等級越高,存活率越高;Class1的存活率(約0.645)約為Class3(約0.245)的2.6倍。