library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
## Warning: package 'hflights' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
hf = tbl_df(hflights)
Q1.一年內哪一些時段容易誤點。
q1 =hf
q1 =
select(q1,Month,ArrDelay,ArrTime,DepDelay)%>%
filter(DepDelay > 0,ArrDelay>0)%>%
group_by(Month)%>%
summarise(avg_ArrDelay=mean(ArrDelay),avg_DepDelay=mean(DepDelay),times=n(),avg_delay= (avg_DepDelay+avg_ArrDelay)*0.5)
## Warning: package 'bindrcpp' was built under R version 3.3.3
#為了避免各月份的總航班本身就有差距,只以延遲的航班總數判斷頻率不公平
pa = hf
pa =
select(pa,Month)%>%
table()%>%
as.data.frame()
names(pa)=c("Month","totaltime")
q1 =
merge(q1, pa, by = "Month")%>%
mutate(freq = paste(round(100 *(times) / (totaltime)),"%"))%>%
arrange(desc( freq ))
q1$Month<- as.factor(q1$Month)
print(q1)
## Month avg_ArrDelay avg_DepDelay times avg_delay totaltime freq
## 1 6 30.49584 29.84637 8299 30.17110 19600 42 %
## 2 5 34.74115 31.23761 7908 32.98938 19172 41 %
## 3 7 31.05423 31.28271 8316 31.16847 20548 40 %
## 4 4 32.46271 29.16866 7186 30.81568 18593 39 %
## 5 3 30.75853 29.93786 6920 30.34819 19470 36 %
## 6 1 24.35494 24.91448 6525 24.63471 18910 35 %
## 7 12 30.75739 33.20635 6397 31.98187 19117 33 %
## 8 2 29.82184 29.68935 5495 29.75560 17128 32 %
## 9 8 27.06185 27.55426 6257 27.30805 20176 31 %
## 10 11 27.81474 27.57439 5155 27.69457 18021 29 %
## 11 9 28.26861 27.27464 5145 27.77162 18065 28 %
## 12 10 29.76657 29.63132 5205 29.69894 18696 28 %
qplot(x=freq,
y=avg_delay,
data=q1,
geom="point", # 圖形=scatter plot
main = "Scatter Plot of delaytime-freq",
xlab="frequence",
ylab="delaytime",
color= Month # 以顏色標註月份,複合式的散布圖
)
## 可以看出前半年(1~6月)無論時間或頻率都偏高
Q2.飛行長短是否會使航班容易延誤?
Q2=hf
Q2=
select(Q2,Distance,ArrDelay,DepDelay)%>%
filter(ArrDelay>0,DepDelay > 0)%>%
mutate(realmiss=ArrDelay-DepDelay)%>%#扣除出發延誤造成的抵達延誤,讓飛行長短成為主要差異
group_by(Distance)%>%
summarise(times=n(),realmiss=mean(realmiss))
#飛行距離與延遲時間長短完全沒有關係
pa2 = hf
pa2 =
select(pa2,Distance)%>%
table()%>%
as.data.frame()
names(pa2)=c("Distance","total")
Q2 =
merge(Q2, pa2, by = "Distance")%>%
mutate(freq = 100 *(times) / (total))%>%
filter(total > 100)#班次太少頻率的參考值沒有意義
ggplot(data=Q2)+
geom_point(aes(
y=realmiss,
x=Distance,
main = "飛行距離與延遲關係",
color= freq
)
)
## Warning: Ignoring unknown aesthetics: main
##雖然飛行距離與延遲,不論是長短或次數都沒有明顯影響,但可以看出'出發延遲但仍抵達並沒有太嚴重延遲的情況(realmiss<0)'明顯偏少,即如果出發延遲了的話,抵達有很高的機率會延遲更久。
一周哪些時段是各航線的巔峰時段
Q3 = hf
Q3 =
select(Q3,DayOfWeek,Distance,Origin,Dest)%>%
mutate(Distance = paste(Origin,">>",Dest))#先區分各條航線
tbl = table(Q3$Distance,Q3$DayOfWeek)%>%
as.data.frame()%>%
arrange(desc(Freq))#有些航線的班次遠多於其他航班,在此只挑班次最多的前三條航線觀察("HOU >> DAL"& "IAH >> ORD" &"IAH >> ATL")
Q3 =
filter(Q3,(Q3$Distance=="HOU >> DAL") | (Q3$Distance=="IAH >> ATL") |(Q3$Distance=="IAH >> ORD"))%>%
select(DayOfWeek,Distance)
tbl =
table(Q3)%>%
as.data.frame.matrix()
pie(tbl$`HOU >> DAL`)
pie(tbl$`IAH >> ORD`)
pie(tbl$`IAH >> ATL`)
##由結果圖只能稍微判斷HOU >> DAL在星期六的班次最少,另外兩條航線的航班分布平均