In [1]:
# Import the pandas library.
import pandas
# Read in the airports data.
# header=None: no row of the file is used as column names (names are assigned in a later cell).
# dtype=str: every field is kept as text, so numeric columns need an explicit conversion before use.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that folder exists.
airports = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airports.csv", header=None, dtype=str)
In [2]:
# Preview the first rows of the raw frame (columns are still unnamed at this point).
airports.head()
Out[2]:
In [3]:
# Keep only the first 12 columns (positions 0-11); any further columns in the file are discarded.
airports=airports.iloc[:,0:12]
In [4]:
# Preview the frame again after trimming it to 12 columns.
airports.head()
Out[4]:
In [5]:
# Assign names to the 12 columns by position; later cells look up "id", "latitude" and "longitude" by these names.
airports.columns = ["id", "name", "city", "country", "code", "icao", "latitude", "longitude", "altitude", "offset", "dst", "timezone"]
In [6]:
# Confirm the column names were applied.
airports.head()
Out[6]:
In [7]:
# Read in the airlines data.
# skiprows=1 discards the first line of the file before parsing; header=None means no row is used as column names.
# dtype=str keeps every field as text (ids are cast to int later where needed).
# NOTE(review): confirm the first line of airlines.csv is meant to be dropped; the other two files are read without skipping.
airlines = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airlines.csv", header=None, dtype=str, skiprows=1)
# Assign the eight column names by position.
airlines.columns = ["id", "name", "alias", "iata", "icao", "callsign", "country", "active"]
In [8]:
# Preview the airlines frame with its assigned column names.
airlines.head()
Out[8]:
In [9]:
# Read in the routes data.
# header=None: no row of the file is used as column names; dtype=str keeps ids and codes as text.
routes = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\routes.csv", header=None, dtype=str)
# Assign the nine column names by position; "source_id" and "dest_id" are matched against airports["id"] later.
routes.columns = ["airline", "airline_id", "source", "source_id", "dest", "dest_id", "codeshare", "stops", "equipment"]
In [10]:
# Preview the routes frame with its assigned column names.
routes.head()
Out[10]:
In [11]:
# Drop routes whose airline_id is the literal two-character text \N (a backslash followed by N).
# Presumably that is the dataset's marker for a missing airline id; verify against the data source.
routes = routes[routes["airline_id"] != "\\N"]
Making a histogram¶
In [13]:
import math
def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """Return the great-circle distance between two points, in kilometres.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point, in degrees.
    lon2, lat2 : longitude and latitude of the second point, in degrees.
        Each coordinate may be a number or a numeric string; it is passed
        through ``float()``.
    radius_km : sphere radius used for the conversion to kilometres
        (defaults to 6367, the value this notebook has always used).

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to ``float``.
    """
    # Convert coordinates to floats, then from degrees to radians.
    lon1, lat1, lon2, lat2 = (math.radians(float(value)) for value in (lon1, lat1, lon2, lat2))
    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` a hair outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))
    return radius_km * c
In [14]:
def calc_dist(row):
    """Return the haversine distance for one route row, or 0 when it cannot be computed.

    ``row`` must provide "source_id" and "dest_id"; each is matched against
    ``airports["id"]`` and the first matching airport supplies the coordinates.
    A missing airport (IndexError from ``.iloc[0]``) or a coordinate that does
    not convert to float (ValueError from ``haversine``) yields 0.
    """
    try:
        # First airport whose id equals each endpoint of the route.
        origin = airports[airports["id"] == row["source_id"]].iloc[0]
        target = airports[airports["id"] == row["dest_id"]].iloc[0]
        # Distance from the destination's coordinates to the source's.
        return haversine(target["longitude"], target["latitude"], origin["longitude"], origin["latitude"])
    except (ValueError, IndexError):
        return 0
In [15]:
# Apply calc_dist to every route row (axis=1); the result holds one distance per route, aligned with routes' index.
# NOTE(review): calc_dist filters the airports frame twice per row, so this cell can be slow on the full routes table.
route_lengths = routes.apply(calc_dist, axis=1)
In [16]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of route lengths (km, as returned by haversine) using 20 equal-width bins.
# Routes for which calc_dist fell back to 0 are included in the first bin.
plt.hist(route_lengths, bins=20)
Out[16]:
Using Seaborn¶
In [17]:
import seaborn
# Same distribution drawn with seaborn, again with 20 bins.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot replace it); revisit once the seaborn version is known.
seaborn.distplot(route_lengths, bins=20)
Out[17]:
Bar Charts¶
In [18]:
# from solution
import numpy
# Put relevant columns into a dataframe.
# route_lengths was produced by routes.apply, so it shares routes' index and lines up row-for-row with airline_id.
route_length_df = pandas.DataFrame({"length": route_lengths, "id": routes["airline_id"]})
# Show the first two rows as a sanity check.
route_length_df.head(2)
Out[18]:
In [19]:
#from solution
# Compute the mean route length per airline and sort it longest-first so the
# charts below read left to right in descending order.
# .mean() is used instead of .aggregate(numpy.mean): it computes the same
# per-group mean, and passing the numpy function to aggregate triggers a
# FutureWarning on recent pandas.
airline_route_lengths = (
    route_length_df
    .groupby("id")
    .mean()
    .sort_values("length", ascending=False)
)
# Show the two airlines with the longest average routes.
airline_route_lengths.head(2)
Out[19]:
In [20]:
#from solution
#Simple Matplotlib
# One bar per airline, positioned by rank (0..n-1); heights follow the descending sort applied above.
plt.bar(range(airline_route_lengths.shape[0]), airline_route_lengths["length"])
Out[20]:
In [21]:
#from solution
def lookup_name(row):
    """Return the name of the first airline whose id equals ``row["id"]``.

    Returns an empty string when no airline matches (IndexError from
    ``.iloc[0]``) or a ValueError is raised during the lookup.
    """
    try:
        # Select the "name" values of the airlines rows with a matching id; take the first.
        return airlines.loc[airlines["id"] == row["id"], "name"].iloc[0]
    except (ValueError, IndexError):
        return ""
# Add the index (the airline ids) as a column.
# The groupby on "id" moved the ids into the index; lookup_name reads row["id"], so it needs them as a regular column.
airline_route_lengths["id"] = airline_route_lengths.index.copy()
airline_route_lengths.head()
Out[21]:
In [22]:
#from solution
# Find all the airline names.
# lookup_name is applied row-wise (axis=1); ids with no match in the airlines frame get an empty string.
airline_route_lengths["name"] = airline_route_lengths.apply(lookup_name, axis=1)
airline_route_lengths.head()
Out[22]:
In [23]:
#from solution
# Replace the airline-id index with a plain 0..n-1 range. The ids remain available in the "id" column,
# and row order (descending mean length) is unchanged.
airline_route_lengths.index = range(airline_route_lengths.shape[0])
In [24]:
#from solution
# Preview the frame after the index was replaced.
airline_route_lengths.head()
Out[24]:
In [25]:
#from solution
import numpy as np
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
# Render bokeh output inline in the notebook.
output_notebook()
# One categorical x position per airline name, one bar height per mean route length.
airline_name = airline_route_lengths.name
avg_route_length = airline_route_lengths.length
# NOTE(review): bokeh categorical ranges need unique factors, and lookup_name returns ""
# for every unmatched id, so duplicate names are possible here. Confirm against the data.
p = figure(x_range=airline_name, plot_height=450, plot_width=1200, title="Airline Route Lengths",
           toolbar_location=None, tools="")
# Adjacent categories are one unit apart, so a width below 1 keeps each bar
# inside its own slot instead of overlapping its neighbours.
p.vbar(x=airline_name, top=avg_route_length, width=0.9)
p.xgrid.grid_line_color = None
p.y_range.start = 0
# Tilt the x labels 45 degrees so long airline names stay readable.
p.xaxis.major_label_orientation = math.pi/4
show(p)
In [26]:
# from solution
# Show only the 20 airlines with the longest average routes to eliminate the clutter.
# (The "10" suffix in the variable names is historical; the slice takes 20 rows.)
output_notebook()
airline_route_lengths10 = airline_route_lengths[:20]
airline_name10 = airline_route_lengths10.name
avg_route_length10 = airline_route_lengths10.length
p10 = figure(x_range=airline_name10, plot_height=450, plot_width=800, title="Airline Route Lengths (top 20)",
             toolbar_location="below")
# Bar heights must come from the same 20-row slice as the x labels; using the
# full-length series here would pair 20 labels with every airline's value.
# Width below 1 keeps each bar within its own category slot.
p10.vbar(x=airline_name10, top=avg_route_length10, width=0.9)
p10.xgrid.grid_line_color = None
p10.y_range.start = 0
# Tilt the x labels 45 degrees so long airline names stay readable.
p10.xaxis.major_label_orientation = math.pi/4
show(p10)
Horizontal bar charts¶
In [28]:
# Fraction of routes in each length band (km):
#   short:  length < 2000
#   medium: 2000 <= length < 10000
#   long:   length >= 10000
# The bands are contiguous, so a length of exactly 2000 or 10000 is counted once
# (with strict comparisons on both sides such values fell into no band).
# Routes for which calc_dist fell back to 0 are counted as short.
total_routes = len(route_lengths)
long_routes = sum(1 for k in route_lengths if k >= 10000) / total_routes
medium_routes = sum(1 for k in route_lengths if 2000 <= k < 10000) / total_routes
short_routes = sum(1 for k in route_lengths if k < 2000) / total_routes
In [29]:
import pygal
from IPython.display import SVG
import os
# Horizontal bar chart with one bar per length band.
chart = pygal.HorizontalBar()
chart.title = 'Long, medium, and short routes'
# The band values are fractions of all routes; multiply by 100 to plot percentages.
chart.add('Long', long_routes * 100)
chart.add('Medium', medium_routes * 100)
chart.add('Short', short_routes * 100)
# Write the chart to routes.svg (relative path), then display that file inline.
chart.render_to_file('routes.svg')
SVG(filename='routes.svg')
Out[29]:
Scatter plots¶
In [30]:
# Length of each airline name. str(x) is applied first so a non-string value
# (e.g. a missing name read as NaN) still yields a length instead of raising.
name_lengths = airlines["name"].apply(lambda x: len(str(x)))
# Scatter airline id (cast from text to int) against name length.
plt.scatter(airlines["id"].astype(int), name_lengths)
Out[30]:
In [31]:
# Collect the same two variables into one frame so seaborn can refer to them by column name.
data = pandas.DataFrame({"lengths": name_lengths, "ids": airlines["id"].astype(int)})
# Joint plot of id against name length.
seaborn.jointplot(x="ids", y="lengths", data=data)
Out[31]:
Static maps¶
In [31]:
# Import the basemap package
from mpl_toolkits.basemap import Basemap
In [32]:
# Create a map on which to draw. We're using a mercator projection spanning
# latitudes -80 to 80 and longitudes -180 to 180, at the coarsest ('c') resolution.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
# Draw coastlines, and the edges of the map.
m.drawcoastlines()
m.drawmapboundary()
# Convert latitude and longitude to x and y coordinates.
# The columns were read as text, so they are cast to float first; the map is called with (longitudes, latitudes).
x, y = m(list(airports["longitude"].astype(float)), list(airports["latitude"].astype(float)))
# Use matplotlib to draw the points onto the map (third argument 1 is presumably the marker size; confirm in the Basemap docs).
m.scatter(x,y,1,marker='o',color='red')
# Show the plot.
plt.show()
Drawing great circles¶
In [33]:
import folium
from folium import plugins
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [34]:
# Create an interactive folium map centred on latitude 30, longitude 0 at zoom level 2.
airports_map = folium.Map(location=[30, 0], zoom_start=2)
# Display the (still empty) map.
airports_map
Out[34]:
In [35]:
from string import printable
# Set of the characters in string.printable, used for fast membership tests.
st = set(printable)
# Replace every character outside that set with a space, one-for-one, so name length is unchanged.
# This overwrites airports["name"], so every later cell sees the cleaned names.
# NOTE(review): each name is iterated character by character, so a non-string value (e.g. NaN) would raise here. Confirm none exist.
airports["name"] = airports["name"].apply(lambda x: ''.join([" " if i not in st else i for i in x]))
In [36]:
# Randomly sample 10 rows from the airports dataframe.
# No random_state is passed, so each run selects a different set of airports.
airports_R = airports.sample(n=10)
In [37]:
# Add one small circle marker per sampled airport, skipping "South Pole Station".
for index, row in airports_R.iterrows():
    if row["name"] == "South Pole Station":
        continue
    # Coordinates were read as text; folium expects [latitude, longitude] as numbers.
    marker_location = [float(row['latitude']), float(row['longitude'])]
    marker = folium.CircleMarker(marker_location, radius=2, popup=row['name'], fill_color="red")
    marker.add_to(airports_map)
# Display the map with the markers added.
airports_map
Out[37]:
In [38]:
# Base map: mercator projection with coastlines, onto which route arcs are drawn.
from mpl_toolkits.basemap import Basemap
m = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80, llcrnrlon=-180, urcrnrlon=180, lat_ts=20, resolution='c')
m.drawcoastlines()
# Only the first 3000 routes are drawn.
for name, row in routes[:3000].iterrows():
    try:
        # First airport matching each endpoint id; .iloc[0] raises IndexError when none matches.
        source = airports[airports["id"] == row["source_id"]].iloc[0]
        dest = airports[airports["id"] == row["dest_id"]].iloc[0]
        source_lon = float(source["longitude"])
        dest_lon = float(dest["longitude"])
        # Skip routes whose endpoints are 90 or more degrees of longitude apart.
        if abs(source_lon - dest_lon) >= 90:
            continue
        source_lat = float(source["latitude"])
        dest_lat = float(dest["latitude"])
        # Great-circle arc from the source airport to the destination airport.
        m.drawgreatcircle(source_lon, source_lat, dest_lon, dest_lat, linewidth=1, color='b')
    except (ValueError, IndexError):
        # Unknown airport or non-numeric coordinate: leave this route off the map.
        continue
# Show the map.
plt.show()
Drawing network diagrams¶
In [39]:
# Count how often each "source_dest" airport-id pair occurs among the routes,
# keeping only pairs that occur at least twice so the network stays manageable.
weights = {}
# Keys seen at least once. A set (rather than a list) keeps each membership
# test O(1), so the loop stays linear in the number of routes.
added_keys = set()
# Walk the two id columns directly; no other field of the route is needed.
for source, dest in zip(routes["source_id"], routes["dest_id"]):
    # One key per directed edge: start and end of the route.
    key = "{0}_{1}".format(source, dest)
    if key in weights:
        # Third or later occurrence: bump the existing weight.
        weights[key] += 1
    elif key in added_keys:
        # Second occurrence: the edge now qualifies, starting at weight 2.
        weights[key] = 2
    else:
        # First occurrence: remember it, but do not create a weight-1 edge.
        added_keys.add(key)
In [40]:
# Build an undirected networkx graph from the weighted airport pairs and draw it.
import networkx as nx
graph = nx.Graph()
# Airport ids already added as nodes.
nodes = set()
for k, weight in weights.items():
    try:
        # Keys have the form "<source id>_<dest id>"; ids that are not integers
        # raise ValueError and the edge is skipped.
        source, dest = k.split("_")
        source, dest = int(source), int(dest)
        # Register each endpoint once.
        for airport_id in (source, dest):
            if airport_id not in nodes:
                graph.add_node(airport_id)
                nodes.add(airport_id)
        # Connect the two airports, carrying the route count as the edge weight.
        graph.add_edge(source, dest, weight=weight)
    except (ValueError, IndexError):
        continue
# Compute node positions with a spring layout.
pos = nx.spring_layout(graph)
# Draw the nodes and edges.
nx.draw_networkx_nodes(graph, pos, node_color='red', node_size=10, alpha=0.8)
nx.draw_networkx_edges(graph, pos, width=1.0, alpha=1)
# Show the plot.
plt.show()