Guided Project - Build a Web App using Streamlit to show NYC collision data
Hassan Abbas
Software Design Engineer | LangChain | FastAPI | Flask | Vue | Quarkus | AI/ML
Introduction:
This article is about a guided, hands-on Coursera project for building a data science web app with the Streamlit library in Python. This is the first time I am using Streamlit to create a web app.
Initial Setup
# Standard library
import json

# Third-party
import certifi
import numpy as np
import pandas as pd
import plotly.express as px  # code below calls px.bar, so the alias must be `px` (was `pxt`)
import pydeck as pdk
import streamlit as st       # code below uses `st.` everywhere, so the alias must be `st` (was `s`)
import urllib3
If you are doing this project on your own computer rather than in the cloud workspace provided by Coursera, you will have to install the Streamlit library on your system by running the following in a terminal:
pip install streamlit
Now, write this in terminal to test if the library is installed properly.
streamlit hello
An app will open in your browser as shown in figure below.
I will be using API Endpoint instead of the CSV file.
API Endpoint: https://data.cityofnewyork.us/resource/h9gi-nx95.json
# NYC Open Data "Motor Vehicle Collisions - Crashes" dataset (JSON API endpoint).
# A local CSV export of the same dataset could be substituted here when offline.
data_url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json'
Function to load the data:
@st.cache(persist=True)  # closing paren was missing; persist=True also caches to disk
def load_data(nrows):
    """Fetch up to `nrows` collision records from the NYC Open Data API.

    Returns a DataFrame with a combined `crash_date_time` datetime column,
    integer casualty counts, and float latitude/longitude columns.
    """
    # Verify the data source's TLS certificate against certifi's CA bundle.
    http = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',
        ca_certs=certifi.where())
    req = http.request('GET', data_url)  # fetch the JSON payload from the API
    print('Status of request: ', req.status)  # 200 if OK

    # The endpoint returns a JSON array of objects -> list of dicts -> DataFrame.
    records = json.loads(req.data.decode('utf-8'))
    df = pd.json_normalize(records)

    # Merge the separate date and time fields into a single datetime column.
    df['crash_date_time'] = df['crash_date'].str.slice(0, 10) + " " + df['crash_time']
    df['crash_date_time'] = pd.to_datetime(df['crash_date_time'], format='%Y-%m-%d %H:%M')
    df.drop(['crash_date', 'crash_time'], inplace=True, axis=1)

    # The API delivers every field as a string; cast counts and coordinates.
    # (The original spelled this 8-column list out twice; name it once instead.)
    count_cols = [
        'number_of_persons_injured', 'number_of_persons_killed',
        'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
        'number_of_cyclist_injured', 'number_of_cyclist_killed',
        'number_of_motorist_injured', 'number_of_motorist_killed',
    ]
    df[count_cols] = df[count_cols].astype(int)
    df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)
    return df[:nrows]
Web App layout and Graphs:
# ---- Page header and 2-D injury map -------------------------------------
data = load_data(10000)
original_data = data  # keep an unfiltered reference for the tables further down

st.title('motor vehicle collisions in nyc'.title())
# The original string had mojibake ("??????") where emojis were lost, and a
# needless f-prefix on a string with no placeholders.
st.markdown('''
This is a dashboard that can be used to analyze
motor vehicle collisions in nyc 🗽💥🚗
''')

st.header("Where are the most people injured in nyc?")  # closing paren was missing
injured_people = st.slider('Number Of People Injured In Vehicle ', 0, 19)
# Plot every collision with at least the selected number of injuries.
st.map(data.query('number_of_persons_injured >= @injured_people')[["latitude", "longitude"]].dropna(how="any"))
# ---- 3-D hexagon map of collisions for a chosen hour --------------------
st.header("how many collisions occur during given time of the day?".title())
hour = st.selectbox("hour to look at", range(0, 24), 1)
data = data[data['crash_date_time'].dt.hour == hour]  # narrow to the chosen hour
st.markdown("vehicle collisions between %i:00 and %i:00" % (hour % 24, (hour + 1) % 24))

# Center the map on the mean coordinate of the filtered collisions.
midpoint = [data['latitude'].mean(skipna=True), data['longitude'].mean(skipna=True)]
st.write(pdk.Deck(
    map_style='mapbox://styles/mapbox/light-v10',
    initial_view_state={
        "latitude": midpoint[0],
        "longitude": midpoint[1],
        "zoom": 11,
        "pitch": 50,
    },
    layers=[
        pdk.Layer(
            "HexagonLayer",
            data=data[['crash_date_time', 'latitude', 'longitude']].dropna(),
            get_position=['longitude', 'latitude'],
            radius=100,
            extruded=True,
            pickable=True,
            elevation_scale=4,  # was `elevation_Scale` — pydeck's kwarg is snake_case
            elevation_range=[0, 1000],
        ),
    ]
))
# ---- Per-minute histogram for the selected hour -------------------------
st.subheader('breakdown by minute between %i:00 and %i:00' % (hour, (hour + 1) % 24))

# Restrict to crashes within [hour, hour + 1). `data` was already narrowed to
# this hour above, so this acts as a defensive re-filter.
in_window = data[
    (data['crash_date_time'].dt.hour >= hour) & (data['crash_date_time'].dt.hour < (hour + 1))
]

# Count crashes in each of the 60 one-minute bins of the hour.
counts, _ = np.histogram(in_window['crash_date_time'].dt.minute, bins=60, range=(0, 60))
chart_data = pd.DataFrame({'minute': range(60), 'crashes': counts})

fig = px.bar(chart_data, x='minute', y='crashes', hover_data=['minute', 'crashes'], height=400)
st.write(fig)
# ---- Top-5 most dangerous streets by affected road-user type ------------
st.header("top 5 dangerous streets in nyc by affected type".title())
select = st.selectbox('affected type of people', ["pedestrians", "cyclists", "motorists"])

# Injury-count column per road-user type. The API's column names use singular
# "cyclist"/"motorist" but plural "pedestrians".
INJURY_COLUMNS = {
    "pedestrians": "number_of_pedestrians_injured",
    "cyclists": "number_of_cyclist_injured",
    "motorists": "number_of_motorist_injured",
}

def _top_streets(df, injured_col, n=5):
    """Return the top `n` (street, injury count) rows with at least one injury.

    Replaces three near-identical copy-pasted branches (which also mixed a
    dangling `if` with an `elif`); output for each selection is unchanged.
    """
    return (df.query("%s >= 1" % injured_col)
              [["on_street_name", injured_col]]
              .sort_values(by=[injured_col], ascending=False)
              .dropna(how='any')[:n])

st.write(_top_streets(original_data, INJURY_COLUMNS[select]))
# ---- Optional raw-data table --------------------------------------------
# (A stray unmatched closing parenthesis followed this block in the original;
# it has been removed.)
if st.checkbox('Show raw data', False):
    st.subheader('raw data')
    st.write(original_data)
References:
LVER at New York State Department of Labor
9 months ago — Excellent job on your Coursera Guided Project!