# Import the pandas library.
import pandas
# Read in the airports data.
# header=None: the file is parsed without a header row (names are assigned below).
# dtype=str: every field is kept as text, so later numeric use needs explicit casts.
airports = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airports.csv", header=None, dtype=str)

airports.head()

# Keep only the first 12 columns so they line up with the 12 names assigned below.
airports=airports.iloc[:,0:12]

airports.head()

airports.columns = ["id", "name", "city", "country", "code", "icao", "latitude", "longitude", "altitude", "offset", "dst", "timezone"]

airports.head()

# Read in the airlines data.
# skiprows=1 discards the first line of the file before parsing.
airlines = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airlines.csv", header=None, dtype=str, skiprows=1)
airlines.columns = ["id", "name", "alias", "iata", "icao", "callsign", "country", "active"]

airlines.head()

# Read in the routes data.
routes = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\routes.csv", header=None, dtype=str)
routes.columns = ["airline", "airline_id", "source", "source_id", "dest", "dest_id", "codeshare", "stops", "equipment"]

routes.head()

# Drop routes whose airline id is the literal backslash-N text
# (presumably the dataset's missing-value marker -- TODO confirm).
routes = routes[routes["airline_id"] != "\\N"]

Making a histogram

import math

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.

        Each coordinate may be a number or a numeric string.

    Returns:
        The haversine distance in kilometres, computed with an Earth radius
        of 6367 km. NaN coordinates propagate to a NaN result.

    Raises:
        ValueError: If a coordinate string cannot be parsed as a float.
    """
    # Convert every coordinate to float, then from degrees to radians.
    lon1, lat1, lon2, lat2 = (
        math.radians(float(value)) for value in (lon1, lat1, lon2, lat2)
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    root = math.sqrt(a)
    # Floating-point rounding can leave the root marginally above 1 for
    # near-antipodal points, which would make asin() raise a domain error.
    # The explicit comparison (rather than min()) lets NaN pass through.
    if root > 1.0:
        root = 1.0
    c = 2 * math.asin(root)
    return 6367 * c

def calc_dist(row):
    """Return the length of one route, or 0 when it cannot be computed.

    Args:
        row: A route record providing "source_id" and "dest_id".

    Returns:
        The haversine distance between the two airports found in the
        module-level ``airports`` table, or 0 if either airport is missing
        or a coordinate fails conversion (IndexError / ValueError).
    """
    try:
        # First airport row matching each endpoint id; .iloc[0] raises
        # IndexError when the id is unknown.
        origin = airports.loc[airports["id"] == row["source_id"]].iloc[0]
        target = airports.loc[airports["id"] == row["dest_id"]].iloc[0]
        return haversine(
            target["longitude"],
            target["latitude"],
            origin["longitude"],
            origin["latitude"],
        )
    except (ValueError, IndexError):
        # Best effort: unresolved routes are reported with a length of 0.
        return 0

# Compute one length per route by applying calc_dist row-wise (axis=1).
# calc_dist returns 0 for routes it cannot resolve, so zeros appear in the result.
route_lengths = routes.apply(calc_dist, axis=1)

import matplotlib.pyplot as plt
%matplotlib inline

# Histogram of the route lengths, split into 20 bins.
plt.hist(route_lengths, bins=20)

(array([2.3102e+04, 1.9736e+04, 9.9470e+03, 5.2980e+03, 2.5900e+03,
        1.0630e+03, 8.3900e+02, 1.0170e+03, 9.1200e+02, 7.7100e+02,
        6.4900e+02, 5.5300e+02, 2.4700e+02, 2.3200e+02, 1.4600e+02,
        4.4000e+01, 3.2000e+01, 2.0000e+00, 0.0000e+00, 4.0000e+00]),
 array([    0.        ,   803.60790186,  1607.21580372,  2410.82370558,
         3214.43160744,  4018.0395093 ,  4821.64741116,  5625.25531302,
         6428.86321489,  7232.47111675,  8036.07901861,  8839.68692047,
         9643.29482233, 10446.90272419, 11250.51062605, 12054.11852791,
        12857.72642977, 13661.33433163, 14464.94223349, 15268.55013535,
        16072.15803721]),
 <a list of 20 Patch objects>)

Using Seaborn

import seaborn
# Plot the same route-length distribution with seaborn, again using 20 bins.
# NOTE(review): distplot is reported as deprecated in newer seaborn releases --
# confirm against the installed version before upgrading.
seaborn.distplot(route_lengths, bins=20)

C:\Users\user\Anaconda3\New folder\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

<matplotlib.axes._subplots.AxesSubplot at 0x16848518>

Bar Charts

# from solution
import numpy

# Pair every route's length with the id of the airline operating it.
route_length_df = pandas.DataFrame(
    {
        "length": route_lengths,
        "id": routes["airline_id"],
    }
)
route_length_df.head(2)

# Average the route length within each airline, then order the airlines from
# the longest average to the shortest so the bar chart descends left to right.
airline_route_lengths = (
    route_length_df.groupby("id")
    .agg(numpy.mean)
    .sort_values(by="length", ascending=False)
)
airline_route_lengths.head(2)

# Plain matplotlib bar chart: one bar per airline, positioned by its rank.
plt.bar(range(len(airline_route_lengths)), airline_route_lengths["length"])

<BarContainer object of 547 artists>

#from solution
def lookup_name(row):
    """Return the airline name for ``row["id"]``, or "" when none is found.

    Args:
        row: A record providing an "id" value to match against the "id"
            column of the module-level ``airlines`` table.

    Returns:
        The "name" of the first matching airline, or an empty string when
        the lookup raises IndexError (no match) or ValueError.
    """
    try:
        # Select the names of all airlines with this id and take the first.
        return airlines.loc[airlines["id"] == row["id"], "name"].iloc[0]
    except (ValueError, IndexError):
        return ""

# Add the index (the airline ids) as a regular column, so that lookup_name
# can read it from each row as row["id"].
airline_route_lengths["id"] = airline_route_lengths.index.copy()
airline_route_lengths.head()

# Find all the airline names: lookup_name is applied to each row (axis=1).
airline_route_lengths["name"] = airline_route_lengths.apply(lookup_name, axis=1)

airline_route_lengths.head()

# Replace the airline-id index with plain 0..n-1 positions; the ids remain
# available in the "id" column added above.
airline_route_lengths.index = range(airline_route_lengths.shape[0])

airline_route_lengths.head()

#from solution
import numpy as np
from bokeh.io import output_notebook, show
#from bokeh.charts import Bar, show
from bokeh.plotting import figure

# Route Bokeh output to the notebook.
output_notebook()

#airline_route_lengths2 = airline_route_lengths[:10]

# Airline names go on the x axis and the mean route lengths become bar heights.
airline_name = airline_route_lengths.name
avg_route_length = airline_route_lengths.length

# The airline names are passed as x_range for the x axis; the toolbar and
# tools are switched off.
p = figure(x_range=airline_name, plot_height=450, plot_width=1200, title="Airline Route Lengths",
           toolbar_location=None, tools="")

# One vertical bar per airline.
p.vbar(x=airline_name, top=avg_route_length, width=1.9)

# No vertical grid lines; start the y range at zero.
p.xgrid.grid_line_color = None
p.y_range.start = 0
#p.xaxis.major_label_orientation = "vertical"
# Rotate the x-axis labels by pi/4 radians (45 degrees).
p.xaxis.major_label_orientation = math.pi/4


show(p)

# Show only the top of the ranking to eliminate the clutter of the full chart.

output_notebook()

# Keep the first 20 rows of the table.
airline_route_lengths10 = airline_route_lengths[:20]

airline_name10 = airline_route_lengths10.name
avg_route_length10 = airline_route_lengths10.length

p10 = figure(x_range=airline_name10, plot_height=450, plot_width=800, title="Airline Route Lengths to 10",
            toolbar_location="below")

# Use the sliced lengths so that `top` has exactly as many entries as `x`.
# Passing the full-length series here triggers Bokeh's "columns must be of
# the same length" warning.
p10.vbar(x=airline_name10, top=avg_route_length10, width=1.9)

# No vertical grid lines; start the y range at zero.
p10.xgrid.grid_line_color = None
p10.y_range.start = 0
# Rotate the x-axis labels by pi/4 radians (45 degrees).
p10.xaxis.major_label_orientation = math.pi/4


show(p10)

BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('top', 547), ('x', 20)

Horizontal bar charts

# Fraction of routes in each length band (lengths are in km, as returned by
# haversine). The bands are contiguous -- short < 2000 <= medium < 10000 <= long --
# so a route of exactly 2000 or 10000 is counted once instead of being dropped.
total_routes = len(route_lengths)
long_routes = sum(1 for k in route_lengths if k >= 10000) / total_routes
medium_routes = sum(1 for k in route_lengths if 2000 <= k < 10000) / total_routes
short_routes = sum(1 for k in route_lengths if k < 2000) / total_routes

import pygal
from IPython.display import SVG
import os
# Horizontal bar chart with one bar per route-length band.
chart = pygal.HorizontalBar()
chart.title = 'Long, medium, and short routes'
# The shares are fractions in [0, 1]; multiply by 100 to plot percentages.
chart.add('Long', long_routes * 100)
chart.add('Medium', medium_routes * 100)
chart.add('Short', short_routes * 100)
# Write the chart to routes.svg, then load that file for display.
chart.render_to_file('routes.svg')
SVG(filename='routes.svg')

Scatter plots

# Length of each airline name; str(x) makes non-string values measurable too.
name_lengths = airlines["name"].apply(lambda x: len(str(x)))
# Airline ids were read as text, so cast them to int for the x axis.
plt.scatter(airlines["id"].astype(int), name_lengths)

<matplotlib.collections.PathCollection at 0x16351a58>

# Put the name lengths and integer ids side by side, then draw them with
# seaborn's jointplot.
data = pandas.DataFrame({"lengths": name_lengths, "ids": airlines["id"].astype(int)})
seaborn.jointplot(x="ids", y="lengths", data=data)

C:\Users\user\Anaconda3\New folder\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

<seaborn.axisgrid.JointGrid at 0x14db0c50>

Static maps

# Import the basemap package
from mpl_toolkits.basemap import Basemap

# Create a map on which to draw.  We're using a mercator projection, and showing the whole world
# (latitudes -80..80, longitudes -180..180).
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
# Draw coastlines, and the edges of the map.
m.drawcoastlines()
m.drawmapboundary()
# Convert latitude and longitude to x and y coordinates.
# The coordinates were read as text, hence the float casts.
x, y = m(list(airports["longitude"].astype(float)), list(airports["latitude"].astype(float)))
# Use matplotlib to draw the points onto the map as red circle markers.
m.scatter(x,y,1,marker='o',color='red')
# Show the plot.
plt.show()

C:\Users\user\Anaconda3\New folder\lib\site-packages\mpl_toolkits\basemap\__init__.py:1708: MatplotlibDeprecationWarning: The axesPatch function was deprecated in version 2.1. Use Axes.patch instead.
  limb = ax.axesPatch
C:\Users\user\Anaconda3\New folder\lib\site-packages\mpl_toolkits\basemap\__init__.py:1711: MatplotlibDeprecationWarning: The axesPatch function was deprecated in version 2.1. Use Axes.patch instead.
  if limb is not ax.axesPatch:

Drawing great circles

import folium
from folium import plugins
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Create a folium map centred on [30, 0] at zoom level 2.
airports_map = folium.Map(location=[30, 0], zoom_start=2)
# A bare name as the last expression of a notebook cell displays the map.
airports_map

from string import printable
# Set of characters from string.printable, used for fast membership tests.
st = set(printable)
# Replace every character of an airport name that is not in that set with a space.
airports["name"] = airports["name"].apply(lambda x: ''.join([" " if  i not in  st else i for i in x]))

# Randomly sample 10 airports from the dataframe.
airports_R = airports.sample(n=10)

# Add one circle marker per sampled airport, skipping "South Pole Station".
for index, row in airports_R.iterrows():
    if row["name"] != "South Pole Station":
        # Coordinates were read as text, so cast them to float for folium.
        folium.CircleMarker([float(row['latitude']), float(row['longitude'])],
                        radius=2,
                        popup=row['name'],
                        fill_color="red", # divvy color
                       ).add_to(airports_map)
# Display the map with the markers added.
airports_map

# Make a base map with a mercator projection.  Draw the coastlines.
# Import the basemap package
from mpl_toolkits.basemap import Basemap
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
m.drawcoastlines()

# Iterate through the first 3000 rows.
for name, row in routes[:3000].iterrows():
    try:
        # Get the source and dest airports (first airport row matching each id;
        # .iloc[0] raises IndexError when the id is unknown).
        source = airports[airports["id"] == row["source_id"]].iloc[0]
        dest = airports[airports["id"] == row["dest_id"]].iloc[0]
        # Don't draw overly long routes: skip pairs whose longitudes differ by 90 degrees or more.
        if abs(float(source["longitude"]) - float(dest["longitude"])) < 90:
            # Draw a great circle between source and dest airports.
            m.drawgreatcircle(float(source["longitude"]), float(source["latitude"]), float(dest["longitude"]), float(dest["latitude"]),linewidth=1,color='b')
    except (ValueError, IndexError):
        # Best effort: routes with an unknown airport or a non-numeric coordinate are skipped.
        pass
    
# Show the map.
plt.show()

Drawing network diagrams

# Initialize the weights dictionary: edge key -> number of routes on that edge.
weights = {}
# Keys that have been seen exactly once so far -- we only want edges with a
# weight of more than 1 to keep our network size manageable.
# A set keeps each membership test O(1); a list would be rescanned for every
# route, making the whole loop quadratic in the number of routes.
added_keys = set()
# Iterate through each route.
for name, row in routes.iterrows():
    # Extract the source and dest airport ids.
    source = row["source_id"]
    dest = row["dest_id"]

    # Create a key for the weights dictionary.
    # This corresponds to one edge, and has the start and end of the route.
    key = "{0}_{1}".format(source, dest)
    # If the key is already in weights, increment the weight.
    if key in weights:
        weights[key] += 1
    # Second sighting of the key: start tracking it with a weight of 2.
    elif key in added_keys:
        weights[key] = 2
    # First sighting: only remember the key.
    # This ensures that we aren't adding edges with a weight of 1.
    else:
        added_keys.add(key)

# Import networkx and initialize the graph.
import networkx as nx
graph = nx.Graph()
# Airport ids already inserted into the graph, so each node is added only once.
nodes = set()
# Turn every weighted edge key into a graph edge.
for edge_key, edge_weight in weights.items():
    try:
        # Keys look like "<source>_<dest>"; both halves must parse as integers.
        source, dest = (int(part) for part in edge_key.split("_"))
    except (ValueError, IndexError):
        # Malformed key or non-numeric airport id: leave this edge out.
        continue

    # Register whichever endpoints the graph has not seen yet.
    for endpoint in (source, dest):
        if endpoint not in nodes:
            graph.add_node(endpoint)
            nodes.add(endpoint)

    # Connect the two airports, carrying the route count as the edge weight.
    graph.add_edge(source, dest, weight=edge_weight)

# Compute a position for every node with networkx's spring layout.
pos=nx.spring_layout(graph)

# Draw the nodes (small red markers) and the edges at those positions.
nx.draw_networkx_nodes(graph,pos, node_color='red', node_size=10, alpha=0.8)
nx.draw_networkx_edges(graph,pos,width=1.0,alpha=1)

# Show the plot.
plt.show()

	0	1	2	3	4	5	6	7	8	9	10	11	12	13
0	1	Goroka Airport	Goroka	Papua New Guinea	GKA	AYGA	-6.081689834590001	145.391998291	5282	10	U	Pacific/Port_Moresby	airport	OurAirports
1	2	Madang Airport	Madang	Papua New Guinea	MAG	AYMD	-5.20707988739	145.789001465	20	10	U	Pacific/Port_Moresby	airport	OurAirports
2	3	Mount Hagen Kagamuga Airport	Mount Hagen	Papua New Guinea	HGU	AYMH	-5.826789855957031	144.29600524902344	5388	10	U	Pacific/Port_Moresby	airport	OurAirports
3	4	Nadzab Airport	Nadzab	Papua New Guinea	LAE	AYNZ	-6.569803	146.725977	239	10	U	Pacific/Port_Moresby	airport	OurAirports
4	5	Port Moresby Jacksons International Airport	Port Moresby	Papua New Guinea	POM	AYPY	-9.443380355834961	147.22000122070312	146	10	U	Pacific/Port_Moresby	airport	OurAirports

	0	1	2	3	4	5	6	7	8	9	10	11
0	1	Goroka Airport	Goroka	Papua New Guinea	GKA	AYGA	-6.081689834590001	145.391998291	5282	10	U	Pacific/Port_Moresby
1	2	Madang Airport	Madang	Papua New Guinea	MAG	AYMD	-5.20707988739	145.789001465	20	10	U	Pacific/Port_Moresby
2	3	Mount Hagen Kagamuga Airport	Mount Hagen	Papua New Guinea	HGU	AYMH	-5.826789855957031	144.29600524902344	5388	10	U	Pacific/Port_Moresby
3	4	Nadzab Airport	Nadzab	Papua New Guinea	LAE	AYNZ	-6.569803	146.725977	239	10	U	Pacific/Port_Moresby
4	5	Port Moresby Jacksons International Airport	Port Moresby	Papua New Guinea	POM	AYPY	-9.443380355834961	147.22000122070312	146	10	U	Pacific/Port_Moresby

	id	name	city	country	code	icao	latitude	longitude	altitude	offset	dst	timezone
0	1	Goroka Airport	Goroka	Papua New Guinea	GKA	AYGA	-6.081689834590001	145.391998291	5282	10	U	Pacific/Port_Moresby
1	2	Madang Airport	Madang	Papua New Guinea	MAG	AYMD	-5.20707988739	145.789001465	20	10	U	Pacific/Port_Moresby
2	3	Mount Hagen Kagamuga Airport	Mount Hagen	Papua New Guinea	HGU	AYMH	-5.826789855957031	144.29600524902344	5388	10	U	Pacific/Port_Moresby
3	4	Nadzab Airport	Nadzab	Papua New Guinea	LAE	AYNZ	-6.569803	146.725977	239	10	U	Pacific/Port_Moresby
4	5	Port Moresby Jacksons International Airport	Port Moresby	Papua New Guinea	POM	AYPY	-9.443380355834961	147.22000122070312	146	10	U	Pacific/Port_Moresby

	id	name	alias	iata	icao	callsign	country	active
0	1	Private flight	\N	-	NaN	NaN	NaN	Y
1	2	135 Airways	\N	NaN	GNL	GENERAL	United States	N
2	3	1Time Airline	\N	1T	RNX	NEXTIME	South Africa	Y
3	4	2 Sqn No 1 Elementary Flying Training School	\N	NaN	WYT	NaN	United Kingdom	N
4	5	213 Flight Unit	\N	NaN	TFU	NaN	Russia	N

	airline	airline_id	source	source_id	dest	dest_id	codeshare	equipment
0	2B	410	AER	2965	KZN	2990	NaN	CR2
1	2B	410	ASF	2966	KZN	2990	NaN	CR2
2	2B	410	ASF	2966	MRV	2962	NaN	CR2
3	2B	410	CEK	2968	KZN	2990	NaN	CR2
4	2B	410	CEK	2968	OVB	4078	NaN	CR2

Class work

Tuesday, January 29, 2019

Seven Python Visualization Tools

Making a histogram

Using Seaborn

Bar Charts

Horizontal bar charts

Scatter plots

Static maps

Drawing great circles

Drawing network diagrams

	length	id
id
11806	7261.375734	11806
3201	6332.122085	3201
1008	5861.986606	1008
1299	4943.942048	1299
2183	4708.663933	2183

	length	id	name
0	7261.375734	11806	Compagnie Africaine d\\'Aviation
1	6332.122085	3201	LAN Argentina
2	5861.986606	1008	Albanian Airlines
3	4943.942048	1299	Arkefly
4	4708.663933	2183	Emirates