HW2 Airplane Crashes Since 1908

Analysis of the “Airplane Crashes and Fatalities Since 1908” dataset (source: https://opendata.socrata.com/Government/Airplane-Crashes-and-Fatalities-Since-1908/q2te-8cvq). The main purpose of this homework is to practice the dplyr package of R, including ‘filter’, ‘select’, ‘mutate’, ‘arrange’, ‘summarise’, and ‘group_by’. A secondary goal is to look for interesting trends or behaviors while analyzing the dataset.

Import data and View data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Load the crash records; the first row of the CSV holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
crashdata <- read.csv(
  "Airplane_Crashes_and_Fatalities_Since_1908.csv",
  header = TRUE,
  sep = ","
)
# Report the number of rows and columns that were read.
dim(crashdata)
## [1] 5268   13
# Show the type and a preview of every column.
str(crashdata)
## 'data.frame':    5268 obs. of  13 variables:
##  $ Date        : Factor w/ 4753 levels "01/01/1966","01/01/1970",..: 3297 2372 2699 3184 3682 852 3099 2585 3387 3469 ...
##  $ Time        : Factor w/ 1006 levels "","00:00","00:01",..: 702 200 1 758 385 36 609 1 36 986 ...
##  $ Location    : Factor w/ 4304 levels "Sept-\xc3Žles, Canada",..: 851 126 4179 3391 2297 4059 3120 2291 286 3552 ...
##  $ Operator    : Factor w/ 2477 levels "Alpine Aviation\xc2\xa0",..: 1568 1579 1826 1467 1467 1467 1467 1466 1467 1467 ...
##  $ Flight..    : Factor w/ 725 levels "","-","002","004",..: 1 1 2 1 1 1 1 1 1 1 ...
##  $ Route       : Factor w/ 3245 levels "Gaspé - \xc3Žles-de-la-Madeleine",..: 832 2982 2 2 2 2 2 2 2 2 ...
##  $ Type        : Factor w/ 2447 levels "Short S23 \xe2\u0080˜C\xe2\u0080\x99 Class flying boat",..: 2419 1146 1033 2433 2435 2446 2434 2149 2439 2438 ...
##  $ Registration: Factor w/ 4906 levels "C-FJJR\xc2\xa0",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ cn.In       : Factor w/ 3708 levels "1G137-26\xc2\xa0",..: 172 2 2 2 2 2 2 2 2 2 ...
##  $ Aboard      : int  2 5 1 20 30 41 19 20 22 19 ...
##  $ Fatalities  : int  1 5 1 14 30 21 19 20 22 19 ...
##  $ Ground      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Summary     : Factor w/ 4674 levels "Shortly after departing Perth and after the aircraft had climbed through its assigned altitude, the pilot\xe2\u0080™s speech bec"| __truncated__,..: 1558 1690 3574 3220 1834 1203 1651 1217 2239 2279 ...

Import data and View data

Q1:the planes crashed each month

# Date structure is mm/dd/yyyy: slice the year, month and day out of the
# text by character position. Month is stored as a factor; year and day
# stay as character strings.
crashdata <- crashdata %>%
  mutate(
    year = substr(Date, 7, 10),
    month = as.factor(substr(Date, 1, 2)),
    day = substr(Date, 4, 5)
  )

# Number of crash records in each calendar month (one row per month).
death_month <- crashdata %>%
  group_by(month) %>%
  summarise(cumulative_number = n())

# Bar chart of the monthly counts, with each count printed in white just
# below the top of its bar.
ggplot(death_month, aes(x = month, y = cumulative_number)) +
  geom_col() +
  geom_text(
    aes(label = cumulative_number),
    vjust = 1.5,
    colour = "white"
  )

其中 4、5、6 月較低,但各月發生空難的次數並沒有顯著的差別。

Q2:the planes crashed per year

Q3:people aboard per year during crashes

Q4:people dead per year during crashes

Q5:people survived rate per year during crashes

# Per-year totals: number of crash records, people aboard, fatalities and
# survivors (aboard minus fatalities). Rows with a missing year, Aboard or
# Fatalities value are dropped before aggregating, so they are not counted
# in events_number either.
death_year <- crashdata %>%
  select(year, Aboard, Fatalities) %>%
  filter(!is.na(year), !is.na(Aboard), !is.na(Fatalities)) %>%
  group_by(year) %>%
  summarise(
    events_number = n(),
    aboard = sum(Aboard),
    fatalities = sum(Fatalities),
    survival = sum(Aboard) - sum(Fatalities)
  ) %>%
  mutate(
    # year was extracted as text; make it numeric so it plots on a
    # continuous axis
    year = as.integer(year),
    survival_rate = survival / aboard
  )
# Theme tweaks shared by all four yearly line charts: shift the title
# vertically and hide the x-axis title.
trend_theme <- theme(
  plot.title = element_text(vjust = -1.5),
  axis.title.x = element_blank()
)

# One line chart per yearly measure, all plotted against year.
g1 <- ggplot(death_year, aes(year, events_number)) +
  geom_line() +
  ggtitle("number of events") +
  trend_theme
g2 <- ggplot(death_year, aes(year, aboard)) +
  geom_line() +
  ggtitle("aboard ") +
  trend_theme
g3 <- ggplot(death_year, aes(year, fatalities)) +
  geom_line() +
  ggtitle("fatalities") +
  trend_theme
g4 <- ggplot(death_year, aes(year, survival_rate)) +
  geom_line() +
  ggtitle("survival_rate") +
  trend_theme

# Stack the four charts vertically in a single figure.
grid.arrange(g1, g2, g3, g4, nrow = 4)

就年度來看,空難次數由1908年起開始隨時間增加,到約1973年達到最高峰,但2000年以後亦逐步下降。猜測1908年起增加係因飛機數量增加,飛航的總次數亦增加,但資料目前欠缺飛航的總次數,無法佐證。空難的登機人數、死亡人數和空難次數大致有一致的趨勢。但在死亡率部分,反倒是1998年最高,近年來亦無減少的態勢,可見時代進步亦無法降低空難之死亡率。

Q6:Airlines to avoid

# Total fatalities and number of crash records per operator, ranked by
# total fatalities (highest first).
# The filter refers to the piped column `Operator` rather than
# `crashdata$Operator`, so the condition always lines up with the rows
# currently flowing through the pipeline.
# NOTE(review): `Fatalities > 1` excludes crashes with exactly one fatality;
# confirm whether `>= 1` was intended.
# NOTE(review): Aboard is selected but never summarised, so na.omit() also
# drops rows whose only missing value is Aboard; confirm this is wanted.
death_operator <-
  crashdata %>%
  filter(Fatalities > 1 & !is.na(Operator)) %>%
  select(Operator, Aboard, Fatalities) %>%
  group_by(Operator) %>%
  na.omit() %>%
  summarise(cumulative_number = n(), fatalities = sum(Fatalities)) %>%
  arrange(desc(fatalities))

# Keep only the five operators with the most fatalities.
death_operator_top <- head(death_operator, 5)

# Bar chart of those five operators, ordered from most to fewest
# fatalities, with the total printed inside each bar.
ggplot(
  death_operator_top,
  aes(x = reorder(Operator, -fatalities), y = fatalities)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = fatalities), vjust = 1.5, colour = "white")

因為航空公司太多會無法聚焦,直接用head函數取累計死亡人數前5名的航空公司:Aeroflot(俄羅斯航空公開股份公司,簡稱俄航,是俄羅斯的國家航空公司)、Military - U.S. Air Force(美國空軍)、Air France(法國航空)、American Airlines(美國航空)、Pan American World Airways(泛美航空)。前5名中有3間屬於美國,1間屬於俄羅斯,1間屬於法國。

Q6:問題請教 本來還有進行空難地點分析

# Keep the location together with the passenger and fatality counts, and
# drop rows with any missing value in those three columns.
# No grouping is applied here: nothing is aggregated at this stage, and the
# later summary does its own grouping on the extracted location.
death_location_tmp <-
  crashdata %>%
  select(Location, Aboard, Fatalities) %>%
  na.omit()

# Split every location string at its commas; `x` is a list holding one
# character vector of pieces per row.
x <- strsplit(as.character(death_location_tmp$Location), split = ",")
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): ¿é¤Jªº¦r¦ê 1790 ¤£¾A¥Î©ó¦¹»y¨¥Àô¹Ò
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): ¿é¤Jªº¦r¦ê 2289 ¤£¾A¥Î©ó¦¹»y¨¥Àô¹Ò
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): ¿é¤Jªº¦r¦ê 2419 ¤£¾A¥Î©ó¦¹»y¨¥Àô¹Ò
## Warning in strsplit(as.character(death_location_tmp$Location), split =
## ","): ¿é¤Jªº¦r¦ê 4630 ¤£¾A¥Î©ó¦¹»y¨¥Àô¹Ò
# Peek at the first split result and at its last piece.
x[[1]]
## [1] "Fort Myer" " Virginia"
tail(x[[1]], 1)
## [1] " Virginia"
# Take the text after the last comma of every location in one vectorised
# pass. Rows whose split produced no pieces get an empty string, and any NA
# entries in `x` are carried through as NA.
# The piece is trimmed because the text after a comma starts with a space
# (see " Virginia" above); without trimming, the same place written with and
# without a preceding comma would end up in different groups.
death_location_tmp$location <- vapply(
  x,
  function(parts) {
    if (length(parts) == 0) {
      return("")
    }
    trimws(parts[length(parts)])
  },
  character(1),
  USE.NAMES = FALSE
)
# The original author noted trouble here isolating the text after the comma
# in death_location_tmp; the extraction above covers that case.

# Number of crash records and total fatalities per extracted location,
# ranked by total fatalities (highest first).
death_location <- death_location_tmp %>%
  group_by(location) %>%
  summarise(
    cumulative_number = n(),
    fatalities = sum(Fatalities)
  ) %>%
  arrange(desc(fatalities))

# Keep only the five locations with the most fatalities.
death_location_top <- head(death_location, 5)

# Bar chart of those five locations, ordered from most to fewest
# fatalities, with the total printed inside each bar.
ggplot(
  death_location_top,
  aes(x = reorder(location, -fatalities), y = fatalities)
) +
  geom_col() +
  geom_text(aes(label = fatalities), vjust = 1.5, colour = "white")