library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hflights)
## Warning: package 'hflights' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
hf = tbl_df(hflights)

Q1.一年內哪一些時段容易誤點。

q1 =hf
q1 = 
  select(q1,Month,ArrDelay,ArrTime,DepDelay)%>%
  filter(DepDelay > 0,ArrDelay>0)%>%
  group_by(Month)%>%
  summarise(avg_ArrDelay=mean(ArrDelay),avg_DepDelay=mean(DepDelay),times=n(),avg_delay= (avg_DepDelay+avg_ArrDelay)*0.5)
## Warning: package 'bindrcpp' was built under R version 3.3.3
  #為了避免各月份的總航班本身就有差距,只以延遲的航班總數判斷頻率不公平
pa = hf
pa = 
  select(pa,Month)%>%
  table()%>%
  as.data.frame()
names(pa)=c("Month","totaltime")
q1 = 
  merge(q1, pa, by = "Month")%>%
  mutate(freq = paste(round(100 *(times) / (totaltime)),"%"))%>%
  arrange(desc( freq ))
q1$Month<- as.factor(q1$Month) 
print(q1)
##    Month avg_ArrDelay avg_DepDelay times avg_delay totaltime freq
## 1      6     30.49584     29.84637  8299  30.17110     19600 42 %
## 2      5     34.74115     31.23761  7908  32.98938     19172 41 %
## 3      7     31.05423     31.28271  8316  31.16847     20548 40 %
## 4      4     32.46271     29.16866  7186  30.81568     18593 39 %
## 5      3     30.75853     29.93786  6920  30.34819     19470 36 %
## 6      1     24.35494     24.91448  6525  24.63471     18910 35 %
## 7     12     30.75739     33.20635  6397  31.98187     19117 33 %
## 8      2     29.82184     29.68935  5495  29.75560     17128 32 %
## 9      8     27.06185     27.55426  6257  27.30805     20176 31 %
## 10    11     27.81474     27.57439  5155  27.69457     18021 29 %
## 11     9     28.26861     27.27464  5145  27.77162     18065 28 %
## 12    10     29.76657     29.63132  5205  29.69894     18696 28 %
qplot(x=freq,                               
      y=avg_delay,                              
      data=q1,                      
      geom="point",                         # 圖形=scatter plot
      main = "Scatter Plot of delaytime-freq",  
      xlab="frequence",                          
      ylab="delaytime",                    
      color= Month                          # 以顏色標註月份,複合式的散布圖
      )

## 可以看出前半年(1~6月)無論時間或頻率都偏高

Q2.飛行長短是否會使航班容易延誤?

Q2=hf
Q2=
  select(Q2,Distance,ArrDelay,DepDelay)%>%
  filter(ArrDelay>0,DepDelay > 0)%>%
  mutate(realmiss=ArrDelay-DepDelay)%>%#扣除出發延誤造成的抵達延誤,讓飛行長短成為主要差異
  group_by(Distance)%>%
  summarise(times=n(),realmiss=mean(realmiss))

#飛行距離與延遲時間長短完全沒有關係
pa2 = hf
pa2 = 
  select(pa2,Distance)%>%
  table()%>%
  as.data.frame()
names(pa2)=c("Distance","total")
Q2 = 
  merge(Q2, pa2, by = "Distance")%>%
  mutate(freq = 100 *(times) / (total))%>%
  filter(total > 100)#班次太少頻率的參考值沒有意義
  
ggplot(data=Q2)+
  geom_point(aes(
                y=realmiss,
                x=Distance,                             
                main = "飛行距離與延遲關係",
                color= freq
              )

)
## Warning: Ignoring unknown aesthetics: main

##雖然飛行距離與延遲,不論是長短或次數都沒有明顯影響,但可以看出'出發延遲但仍抵達並沒有太嚴重延遲的情況(realmiss<0)'明顯偏少,即如果出發延遲了的話,抵達有很高的機率會延遲更久。

一周哪些時段是各航線的巔峰時段

Q3 = hf
Q3 =
  select(Q3,DayOfWeek,Distance,Origin,Dest)%>%
  mutate(Distance = paste(Origin,">>",Dest))#先區分各條航線
tbl = table(Q3$Distance,Q3$DayOfWeek)%>%
      as.data.frame()%>%
      arrange(desc(Freq))#有些航線的班次遠多於其他航班,在此只挑班次最多的前三條航線觀察("HOU >> DAL"& "IAH >> ORD" &"IAH >>  ATL")
Q3 = 
  filter(Q3,(Q3$Distance=="HOU >> DAL") | (Q3$Distance=="IAH >> ATL") |(Q3$Distance=="IAH >> ORD"))%>%
  select(DayOfWeek,Distance)
tbl = 
  table(Q3)%>%
  as.data.frame.matrix()

pie(tbl$`HOU >> DAL`)

pie(tbl$`IAH >> ORD`)

pie(tbl$`IAH >> ATL`)

##由結果圖只能稍微判斷HOU >> DAL在星期六的班次最少,另外兩條航線的航班分布平均