Tuesday, January 29, 2019

Seven Python Visulation Tools

Seven Python Visulation tools
In [1]:
# Import the pandas library.
import pandas
# Read in the airports data.
airports = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airports.csv", header=None, dtype=str)
In [2]:
airports.head()
Out[2]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 1 Goroka Airport Goroka Papua New Guinea GKA AYGA -6.081689834590001 145.391998291 5282 10 U Pacific/Port_Moresby airport OurAirports
1 2 Madang Airport Madang Papua New Guinea MAG AYMD -5.20707988739 145.789001465 20 10 U Pacific/Port_Moresby airport OurAirports
2 3 Mount Hagen Kagamuga Airport Mount Hagen Papua New Guinea HGU AYMH -5.826789855957031 144.29600524902344 5388 10 U Pacific/Port_Moresby airport OurAirports
3 4 Nadzab Airport Nadzab Papua New Guinea LAE AYNZ -6.569803 146.725977 239 10 U Pacific/Port_Moresby airport OurAirports
4 5 Port Moresby Jacksons International Airport Port Moresby Papua New Guinea POM AYPY -9.443380355834961 147.22000122070312 146 10 U Pacific/Port_Moresby airport OurAirports
In [3]:
airports=airports.iloc[:,0:12]
In [4]:
airports.head()
Out[4]:
0 1 2 3 4 5 6 7 8 9 10 11
0 1 Goroka Airport Goroka Papua New Guinea GKA AYGA -6.081689834590001 145.391998291 5282 10 U Pacific/Port_Moresby
1 2 Madang Airport Madang Papua New Guinea MAG AYMD -5.20707988739 145.789001465 20 10 U Pacific/Port_Moresby
2 3 Mount Hagen Kagamuga Airport Mount Hagen Papua New Guinea HGU AYMH -5.826789855957031 144.29600524902344 5388 10 U Pacific/Port_Moresby
3 4 Nadzab Airport Nadzab Papua New Guinea LAE AYNZ -6.569803 146.725977 239 10 U Pacific/Port_Moresby
4 5 Port Moresby Jacksons International Airport Port Moresby Papua New Guinea POM AYPY -9.443380355834961 147.22000122070312 146 10 U Pacific/Port_Moresby
In [5]:
airports.columns = ["id", "name", "city", "country", "code", "icao", "latitude", "longitude", "altitude", "offset", "dst", "timezone"]
In [6]:
airports.head()
Out[6]:
id name city country code icao latitude longitude altitude offset dst timezone
0 1 Goroka Airport Goroka Papua New Guinea GKA AYGA -6.081689834590001 145.391998291 5282 10 U Pacific/Port_Moresby
1 2 Madang Airport Madang Papua New Guinea MAG AYMD -5.20707988739 145.789001465 20 10 U Pacific/Port_Moresby
2 3 Mount Hagen Kagamuga Airport Mount Hagen Papua New Guinea HGU AYMH -5.826789855957031 144.29600524902344 5388 10 U Pacific/Port_Moresby
3 4 Nadzab Airport Nadzab Papua New Guinea LAE AYNZ -6.569803 146.725977 239 10 U Pacific/Port_Moresby
4 5 Port Moresby Jacksons International Airport Port Moresby Papua New Guinea POM AYPY -9.443380355834961 147.22000122070312 146 10 U Pacific/Port_Moresby
In [7]:
# Read in the airlines data.
airlines = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\airlines.csv", header=None, dtype=str, skiprows=1)
airlines.columns = ["id", "name", "alias", "iata", "icao", "callsign", "country", "active"]
In [8]:
airlines.head()
Out[8]:
id name alias iata icao callsign country active
0 1 Private flight \N - NaN NaN NaN Y
1 2 135 Airways \N NaN GNL GENERAL United States N
2 3 1Time Airline \N 1T RNX NEXTIME South Africa Y
3 4 2 Sqn No 1 Elementary Flying Training School \N NaN WYT NaN United Kingdom N
4 5 213 Flight Unit \N NaN TFU NaN Russia N
In [9]:
# Read in the routes data.
routes = pandas.read_csv("C:\\Users\\user\\Desktop\\Data Visulatization\\routes.csv", header=None, dtype=str)
routes.columns = ["airline", "airline_id", "source", "source_id", "dest", "dest_id", "codeshare", "stops", "equipment"]
In [10]:
routes.head()
Out[10]:
airline airline_id source source_id dest dest_id codeshare stops equipment
0 2B 410 AER 2965 KZN 2990 NaN 0 CR2
1 2B 410 ASF 2966 KZN 2990 NaN 0 CR2
2 2B 410 ASF 2966 MRV 2962 NaN 0 CR2
3 2B 410 CEK 2968 KZN 2990 NaN 0 CR2
4 2B 410 CEK 2968 OVB 4078 NaN 0 CR2
In [11]:
routes = routes[routes["airline_id"] != "\\N"]

Making a histogram

In [13]:
import math

def haversine(lon1, lat1, lon2, lat2):
    # Convert coordinates to floats.
    lon1, lat1, lon2, lat2 = [float(lon1), float(lat1), float(lon2), float(lat2)]
    # Convert to radians from degrees.
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
    # Compute distance.
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a)) 
    km = 6367 * c
    return km
In [14]:
def calc_dist(row):
    dist = 0
    try:
        # Match source and destination to get coordinates.
        source = airports[airports["id"] == row["source_id"]].iloc[0]
        dest = airports[airports["id"] == row["dest_id"]].iloc[0]
        # Use coordinates to compute distance.
        dist = haversine(dest["longitude"], dest["latitude"], source["longitude"], source["latitude"])
    except (ValueError, IndexError):
        pass
    return dist
In [15]:
route_lengths = routes.apply(calc_dist, axis=1)
In [16]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.hist(route_lengths, bins=20)
Out[16]:
(array([2.3102e+04, 1.9736e+04, 9.9470e+03, 5.2980e+03, 2.5900e+03,
        1.0630e+03, 8.3900e+02, 1.0170e+03, 9.1200e+02, 7.7100e+02,
        6.4900e+02, 5.5300e+02, 2.4700e+02, 2.3200e+02, 1.4600e+02,
        4.4000e+01, 3.2000e+01, 2.0000e+00, 0.0000e+00, 4.0000e+00]),
 array([    0.        ,   803.60790186,  1607.21580372,  2410.82370558,
         3214.43160744,  4018.0395093 ,  4821.64741116,  5625.25531302,
         6428.86321489,  7232.47111675,  8036.07901861,  8839.68692047,
         9643.29482233, 10446.90272419, 11250.51062605, 12054.11852791,
        12857.72642977, 13661.33433163, 14464.94223349, 15268.55013535,
        16072.15803721]),
 <a list of 20 Patch objects>)

Using Seaborn

In [17]:
import seaborn
seaborn.distplot(route_lengths, bins=20)
C:\Users\user\Anaconda3\New folder\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x16848518>

Bar Charts

In [18]:
# from solution
import numpy

# Put relevant columns into a dataframe.
route_length_df = pandas.DataFrame({"length": route_lengths, "id": routes["airline_id"]})
route_length_df.head(2)
Out[18]:
id length
0 410 1505.879552
1 410 1039.785086
In [19]:
#from solution
# Compute the mean route length per airline.
airline_route_lengths = route_length_df.groupby("id").aggregate(numpy.mean)
# Sort by length so we can make a better chart.
#airline_route_lengths = airline_route_lengths.sort("length", ascending=False)
airline_route_lengths = airline_route_lengths.sort_values("length", ascending=False)
airline_route_lengths.head(2)
Out[19]:
length
id
11806 7261.375734
3201 6332.122085
In [20]:
#from solution
#Simple Matplotlib
plt.bar(range(airline_route_lengths.shape[0]), airline_route_lengths["length"])
Out[20]:
<BarContainer object of 547 artists>
In [21]:
#from solution
def lookup_name(row):
    try:
        # Match the row id to the id in the airlines dataframe so we can get the name.
        name = airlines["name"][airlines["id"] == row["id"]].iloc[0]
    except (ValueError, IndexError):
        name = ""
    return name

# Add the index (the airline ids) as a column.
airline_route_lengths["id"] = airline_route_lengths.index.copy()
airline_route_lengths.head()
Out[21]:
length id
id
11806 7261.375734 11806
3201 6332.122085 3201
1008 5861.986606 1008
1299 4943.942048 1299
2183 4708.663933 2183
In [22]:
#from solution
# Find all the airline names.
airline_route_lengths["name"] = airline_route_lengths.apply(lookup_name, axis=1)

airline_route_lengths.head()
Out[22]:
length id name
id
11806 7261.375734 11806 Compagnie Africaine d\\'Aviation
3201 6332.122085 3201 LAN Argentina
1008 5861.986606 1008 Albanian Airlines
1299 4943.942048 1299 Arkefly
2183 4708.663933 2183 Emirates
In [23]:
#from solution
# Remove duplicate values in the index.
airline_route_lengths.index = range(airline_route_lengths.shape[0])
In [24]:
#from solution
airline_route_lengths.head()
Out[24]:
length id name
0 7261.375734 11806 Compagnie Africaine d\\'Aviation
1 6332.122085 3201 LAN Argentina
2 5861.986606 1008 Albanian Airlines
3 4943.942048 1299 Arkefly
4 4708.663933 2183 Emirates
In [25]:
#from solution
import numpy as np
from bokeh.io import output_notebook, show
#from bokeh.charts import Bar, show
from bokeh.plotting import figure

output_notebook()

#airline_route_lengths2 = airline_route_lengths[:10]

airline_name = airline_route_lengths.name
avg_route_length = airline_route_lengths.length

p = figure(x_range=airline_name, plot_height=450, plot_width=1200, title="Airline Route Lengths",
           toolbar_location=None, tools="")

p.vbar(x=airline_name, top=avg_route_length, width=1.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0
#p.xaxis.major_label_orientation = "vertical"
p.xaxis.major_label_orientation = math.pi/4


show(p)
Loading BokehJS ...
In [26]:
# from solution
#showing only the few top routes to eliminate the clutter

output_notebook()

airline_route_lengths10 = airline_route_lengths[:20]

airline_name10 = airline_route_lengths10.name
avg_route_length10 = airline_route_lengths10.length

p10 = figure(x_range=airline_name10, plot_height=450, plot_width=800, title="Airline Route Lengths to 10",
            toolbar_location="below")

p10.vbar(x=airline_name10, top=avg_route_length, width=1.9)

p10.xgrid.grid_line_color = None
p10.y_range.start = 0
#p.xaxis.major_label_orientation = "vertical"
p10.xaxis.major_label_orientation = math.pi/4


show(p10)
Loading BokehJS ...
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('top', 547), ('x', 20)

Horizontal bar charts

In [28]:
long_routes = len([k for k in route_lengths if k > 10000]) / len(route_lengths)
medium_routes = len([k for k in route_lengths if k < 10000 and k > 2000]) / len(route_lengths)
short_routes = len([k for k in route_lengths if k < 2000]) / len(route_lengths)
In [29]:
import pygal
from IPython.display import SVG
import os
chart = pygal.HorizontalBar()
chart.title = 'Long, medium, and short routes'
chart.add('Long', long_routes * 100)
chart.add('Medium', medium_routes * 100)
chart.add('Short', short_routes * 100)
chart.render_to_file('routes.svg')
SVG(filename='routes.svg')
Out[29]:
Long, medium, and short routes01020304050607073.00547749343.80000000000007401.9743589743590626.09401048131.3796590153445257.00000000000010.900512026717.300695550983022112.02564102564105Long, medium, and short routesLongMediumShort

Scatter plots

In [30]:
name_lengths = airlines["name"].apply(lambda x: len(str(x)))
plt.scatter(airlines["id"].astype(int), name_lengths)
Out[30]:
<matplotlib.collections.PathCollection at 0x16351a58>
In [31]:
data = pandas.DataFrame({"lengths": name_lengths, "ids": airlines["id"].astype(int)})
seaborn.jointplot(x="ids", y="lengths", data=data)
C:\Users\user\Anaconda3\New folder\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[31]:
<seaborn.axisgrid.JointGrid at 0x14db0c50>

Static maps

In [31]:
# Import the basemap package
from mpl_toolkits.basemap import Basemap
In [32]:
# Create a map on which to draw.  We're using a mercator projection, and showing the whole world.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
# Draw coastlines, and the edges of the map.
m.drawcoastlines()
m.drawmapboundary()
# Convert latitude and longitude to x and y coordinates
x, y = m(list(airports["longitude"].astype(float)), list(airports["latitude"].astype(float)))
# Use matplotlib to draw the points onto the map.
m.scatter(x,y,1,marker='o',color='red')
# Show the plot.
plt.show()
C:\Users\user\Anaconda3\New folder\lib\site-packages\mpl_toolkits\basemap\__init__.py:1708: MatplotlibDeprecationWarning: The axesPatch function was deprecated in version 2.1. Use Axes.patch instead.
  limb = ax.axesPatch
C:\Users\user\Anaconda3\New folder\lib\site-packages\mpl_toolkits\basemap\__init__.py:1711: MatplotlibDeprecationWarning: The axesPatch function was deprecated in version 2.1. Use Axes.patch instead.
  if limb is not ax.axesPatch:

Drawing great circles

In [33]:
import folium
from folium import plugins
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [34]:
airports_map = folium.Map(location=[30, 0], zoom_start=2)
airports_map
Out[34]:
In [35]:
from string import printable
st = set(printable)
airports["name"] = airports["name"].apply(lambda x: ''.join([" " if  i not in  st else i for i in x]))
In [36]:
# Randomly sample 7 elements from dataframe
airports_R = airports.sample(n=10)
In [37]:
for index, row in airports_R.iterrows():
    if row["name"] != "South Pole Station":
        folium.CircleMarker([float(row['latitude']), float(row['longitude'])],
                        radius=2,
                        popup=row['name'],
                        fill_color="red", # divvy color
                       ).add_to(airports_map)
airports_map
Out[37]:
In [38]:
# Make a base map with a mercator projection.  Draw the coastlines.
# Import the basemap package
from mpl_toolkits.basemap import Basemap
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
m.drawcoastlines()

# Iterate through the first 3000 rows.
for name, row in routes[:3000].iterrows():
    try:
        # Get the source and dest airports.
        source = airports[airports["id"] == row["source_id"]].iloc[0]
        dest = airports[airports["id"] == row["dest_id"]].iloc[0]
        # Don't draw overly long routes.
        if abs(float(source["longitude"]) - float(dest["longitude"])) < 90:
            # Draw a great circle between source and dest airports.
            m.drawgreatcircle(float(source["longitude"]), float(source["latitude"]), float(dest["longitude"]), float(dest["latitude"]),linewidth=1,color='b')
    except (ValueError, IndexError):
        pass
    
# Show the map.
plt.show()

Drawing network diagrams

In [39]:
# Initialize the weights dictionary.
weights = {}
# Keep track of keys that have been added once -- we only want edges with a weight of more than 1 to keep our network size manageable.
added_keys = []
# Iterate through each route.
for name, row in routes.iterrows():
    # Extract the source and dest airport ids.
    source = row["source_id"]
    dest = row["dest_id"]
    
    # Create a key for the weights dictionary.
    # This corresponds to one edge, and has the start and end of the route.
    key = "{0}_{1}".format(source, dest)
    # If the key is already in weights, increment the weight.
    if key in weights:
        weights[key] += 1
    # If the key is in added keys, initialize the key in the weights dictionary, with a weight of 2.
    elif key in added_keys:
        weights[key] = 2
    # If the key isn't in added_keys yet, append it.
    # This ensures that we aren't adding edges with a weight of 1.
    else:
        added_keys.append(key)
In [40]:
# Import networkx and initialize the graph.
import networkx as nx
graph = nx.Graph()
# Keep track of added nodes in this set so we don't add twice.
nodes = set()
# Iterate through each edge.
for k, weight in weights.items():
    try:
        # Split the source and dest ids and convert to integers.
        source, dest = k.split("_")
        source, dest = [int(source), int(dest)]
        # Add the source if it isn't in the nodes.
        if source not in nodes:
            graph.add_node(source)
        # Add the dest if it isn't in the nodes.
        if dest not in nodes:
            graph.add_node(dest)
        # Add both source and dest to the nodes set.
        # Sets don't allow duplicates.
        nodes.add(source)
        nodes.add(dest)
        
        # Add the edge to the graph.
        graph.add_edge(source, dest, weight=weight)
    except (ValueError, IndexError):
        pass

pos=nx.spring_layout(graph)

# Draw the nodes and edges.
nx.draw_networkx_nodes(graph,pos, node_color='red', node_size=10, alpha=0.8)
nx.draw_networkx_edges(graph,pos,width=1.0,alpha=1)

# Show the plot.
plt.show()
style="text-align: left;" trbidi="on">