Analysis of the “Airplane Crashes and Fatalities Since 1908” dataset (source: https://opendata.socrata.com/Government/Airplane-Crashes-and-Fatalities-Since-1908/q2te-8cvq). The main purpose of this homework is to practice the dplyr package in R, including ‘filter’, ‘select’, ‘mutate’, ‘arrange’, ‘summarise’, and ‘group_by’. A secondary goal is to look for interesting trends or patterns while analyzing the dataset.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Load the crash records; the CSV has a header row and comma-separated fields.
# NOTE(review): several text columns show undecoded bytes (e.g. "\xc3") in the
# str() output; the file's encoding may need to be declared on read - confirm.
crashdata <- read.csv(
  "Airplane_Crashes_and_Fatalities_Since_1908.csv",
  header = TRUE,
  sep = ","
)
# Quick structural check: row/column counts, then the type of every column.
dim(crashdata)
## [1] 5268 13
str(crashdata)
## 'data.frame': 5268 obs. of 13 variables:
## $ Date : Factor w/ 4753 levels "01/01/1966","01/01/1970",..: 3297 2372 2699 3184 3682 852 3099 2585 3387 3469 ...
## $ Time : Factor w/ 1006 levels "","00:00","00:01",..: 702 200 1 758 385 36 609 1 36 986 ...
## $ Location : Factor w/ 4304 levels "Sept-\xc3Žles, Canada",..: 851 126 4179 3391 2297 4059 3120 2291 286 3552 ...
## $ Operator : Factor w/ 2477 levels "Alpine Aviation\xc2\xa0",..: 1568 1579 1826 1467 1467 1467 1467 1466 1467 1467 ...
## $ Flight.. : Factor w/ 725 levels "","-","002","004",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Route : Factor w/ 3245 levels "Gaspé - \xc3Žles-de-la-Madeleine",..: 832 2982 2 2 2 2 2 2 2 2 ...
## $ Type : Factor w/ 2447 levels "Short S23 \xe2\u0080˜C\xe2\u0080\x99 Class flying boat",..: 2419 1146 1033 2433 2435 2446 2434 2149 2439 2438 ...
## $ Registration: Factor w/ 4906 levels "C-FJJR\xc2\xa0",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ cn.In : Factor w/ 3708 levels "1G137-26\xc2\xa0",..: 172 2 2 2 2 2 2 2 2 2 ...
## $ Aboard : int 2 5 1 20 30 41 19 20 22 19 ...
## $ Fatalities : int 1 5 1 14 30 21 19 20 22 19 ...
## $ Ground : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Summary : Factor w/ 4674 levels "Shortly after departing Perth and after the aircraft had climbed through its assigned altitude, the pilot\xe2\u0080™s speech bec"| __truncated__,..: 1558 1690 3574 3220 1834 1203 1651 1217 2239 2279 ...
# Date strings are laid out as mm/dd/yyyy, so each part sits at a fixed position.
# year and day stay as text; month becomes a factor so it can be grouped on.
crashdata <- crashdata %>%
  mutate(
    year = substr(Date, start = 7, stop = 10),
    month = as.factor(substr(Date, start = 1, stop = 2)),
    day = substr(Date, start = 4, stop = 5)
  )
# Number of recorded crashes in each calendar month (one row per month).
death_month <- crashdata %>%
  group_by(month) %>%
  summarise(cumulative_number = n())

# Bar chart of the monthly counts, with each count printed inside its bar.
ggplot(death_month, aes(x = month, y = cumulative_number)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = cumulative_number),
    vjust = 1.5,
    colour = "white"
  )
其中 4、5、6 月的空難次數較低,但各月發生空難的次數並沒有顯著的差別。
# Per-year totals: number of crashes, people aboard, fatalities and survivors.
# Rows with a missing Aboard or Fatalities value are dropped before summing.
death_year <- crashdata %>%
  select(year, Aboard, Fatalities) %>%
  group_by(year) %>%
  na.omit() %>%
  summarise(
    events_number = n(),
    aboard = sum(Aboard),
    fatalities = sum(Fatalities)
  ) %>%
  mutate(
    # year was extracted as text; make it numeric so it plots on a continuous axis.
    year = as.integer(year),
    survival = aboard - fatalities,
    survival_rate = survival / aboard
  )
# Four yearly line charts sharing the same look: the title is nudged via vjust
# and the x-axis title is hidden, since all panels are stacked over the same years.
g1 <- ggplot(death_year, aes(x = year, y = events_number)) +
  geom_line() +
  ggtitle("number of events") +
  theme(
    plot.title = element_text(vjust = -1.5),
    axis.title.x = element_blank()
  )

g2 <- ggplot(death_year, aes(x = year, y = aboard)) +
  geom_line() +
  ggtitle("aboard ") +
  theme(
    plot.title = element_text(vjust = -1.5),
    axis.title.x = element_blank()
  )

g3 <- ggplot(death_year, aes(x = year, y = fatalities)) +
  geom_line() +
  ggtitle("fatalities") +
  theme(
    plot.title = element_text(vjust = -1.5),
    axis.title.x = element_blank()
  )

g4 <- ggplot(death_year, aes(x = year, y = survival_rate)) +
  geom_line() +
  ggtitle("survival_rate") +
  theme(
    plot.title = element_text(vjust = -1.5),
    axis.title.x = element_blank()
  )

# Stack the four panels vertically, one per row.
grid.arrange(g1, g2, g3, g4, nrow = 4)
就年度來看,空難次數由1908年起開始隨時間增加,到約1973年達到最高峰,但2000年以後亦逐步下降。 猜測1908年起增加係因飛機數量增加,飛航的總次數亦增加,但資料目前欠缺飛航的總次數,無法佐證。 空難的登機人數、死亡人數和空難次數大致有一致的趨勢。 但在死亡率部分(圖中繪製的是存活率,死亡率即 1 減去存活率),反倒1998年最高,近年來亦無減少的態勢,可見,時代進步亦無法降低空難之死亡率。
# Crash count and total fatalities per operator, largest totals first.
# NOTE(review): `Fatalities > 1` also excludes single-fatality crashes, and
# na.omit() drops rows whose Aboard is missing even though Aboard is not
# summarised; confirm both are intended.
death_operator <- crashdata %>%
  # Test the piped column itself; indexing the outer `crashdata$Operator` here
  # only works while the row counts happen to line up.
  filter(Fatalities > 1 & !is.na(Operator)) %>%
  select(Operator, Aboard, Fatalities) %>%
  group_by(Operator) %>%
  na.omit() %>%
  summarise(cumulative_number = n(), fatalities = sum(Fatalities)) %>%
  arrange(desc(fatalities))

# Keep the five operators with the most fatalities so the chart stays readable.
death_operator_top <- head(death_operator, 5)

# Bars ordered from most to fewest fatalities, each labelled with its total.
ggplot(
  death_operator_top,
  aes(x = reorder(Operator, -fatalities), y = fatalities)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = fatalities), vjust = 1.5, colour = "white")
因為航空公司太多會無法聚焦,直接用 head 函數取前 5 名的航空公司:Aeroflot(俄羅斯航空公開股份公司,簡稱俄航,是俄羅斯的國家航空公司)、Military - U.S. Air Force(美國空軍)、Air France(法國航空)、American Airlines(美國航空)、Pan American World Airways(泛美航空)。 前 5 間累計死亡人數最多的公司中,有 3 間屬於美國,1 間屬於法國,1 間屬於俄羅斯。
# Keep the columns needed for the per-location summary and drop rows with
# missing values. No group_by() here: nothing is summarised per Location at
# this point, and a grouped frame would carry its grouping through every
# later column assignment.
death_location_tmp <- crashdata %>%
  select(Location, Aboard, Fatalities) %>%
  na.omit()

# Split every location on commas (e.g. "Fort Myer, Virginia" gives two pieces).
x <- strsplit(as.character(death_location_tmp$Location), split = ",")
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): 輸入的字串 1790 不適用於此語言環境
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): 輸入的字串 2289 不適用於此語言環境
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): 輸入的字串 2419 不適用於此語言環境
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): 輸入的字串 4630 不適用於此語言環境
# Spot-check the split: show the pieces of the first location, then its last piece.
x[[1]]
## [1] "Fort Myer" " Virginia"
tail(x[[1]],1)
## [1] " Virginia"
# Keep only the last comma-separated piece of each location (e.g. "Virginia"
# for "Fort Myer, Virginia"). Pieces are trimmed so that " Virginia" and
# "Virginia" end up in the same group; a location that splits into no pieces
# becomes "".
stopifnot(length(x) == nrow(death_location_tmp))
death_location_tmp$location <- vapply(
  x,
  function(parts) {
    if (length(parts) == 0L) {
      ""
    } else {
      trimws(parts[[length(parts)]])
    }
  },
  character(1)
)
# Note from the original author (translated): extracting the text after the
# comma was reported as not working at this point.
# NOTE(review): the untrimmed leading space and the element-by-element loop are
# the likely causes; confirm the grouped result now looks right.
# Crash count and total fatalities per extracted location, largest totals first.
death_location <- death_location_tmp %>%
  group_by(location) %>%
  summarise(
    cumulative_number = n(),
    fatalities = sum(Fatalities)
  ) %>%
  arrange(desc(fatalities))

# Only the five locations with the most fatalities are charted.
death_location_top <- head(death_location, 5)

# Bars ordered from most to fewest fatalities, each labelled with its total.
ggplot(
  death_location_top,
  aes(x = reorder(location, -fatalities), y = fatalities)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = fatalities),
    vjust = 1.5,
    colour = "white"
  )