If using Google Colab, uncomment the following code block.
In [1]:
# %%capture
# !git clone https://github.com/predicthq/phq-data-science-docs.git
# %cd phq-data-science-docs/attended-events
# !pip install predicthq timezonefinder calmap==0.0.9
If running locally, set up a Python environment using requirements.txt
shared alongside the notebook to install the required dependencies.
In [2]:
import ast
import calendar
import os
import random
from datetime import timedelta

import calmap
import folium
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pytz
import rfc3339
from folium.plugins import HeatMap
from plotly.subplots import make_subplots
from predicthq import Client
from timezonefinder import TimezoneFinder
# To display more columns in the dataframe.
pd.set_option("display.max_columns", 50)
%matplotlib inline
SDK or CSV Data Access¶
This notebook can be run using the CSV example data provided, or if you have access to the PredictHQ API, you can use the code provided to call the API using our SDK for the locations of interest to you.
The starting point for Part 2 is the dataset created at the end of Part 1. As using the SDK is not the focus of Part 2, a function is created to call the SDK. For guidance on how to use the SDK, please refer to Part 1. If you do not have access to the SDK, the notebook also works with a number of CSV files that are provided alongside the notebook.
In [3]:
# Set whether to run with SDK or using provided CSV files.
# Set to be either "CSV" or "SDK".
RUN_SETTING = "CSV"

if RUN_SETTING == "SDK":
    # Read the access token from the environment so credentials are never
    # committed to the notebook; the inline placeholder is kept as a
    # backward-compatible fallback for readers who prefer to paste it here.
    ACCESS_TOKEN = os.environ.get("PREDICTHQ_ACCESS_TOKEN", "REPLACE_WITH_ACCESS_TOKEN")
    phq = Client(access_token=ACCESS_TOKEN)
def query_attended_events(
    start_time,
    end_time,
    radius,
    radius_unit,
    latitude,
    longitude,
    categories,
    rank_type=None,
    filter_parameter="gte",
    rank_threshold="40",
):
    """
    Query Attended Events based on time, location, category, and rank thresholds.

    Args:
        start_time: start of the period for querying Attended Events.
            Format "YYYY-MM-DD"
        end_time: end of the period for querying Attended Events.
            Format of "YYYY-MM-DD"
        radius: radius for querying Attended Events.
        radius_unit: unit of the radius.
        latitude: latitude of the interested location.
        longitude: longitude of the interested location.
        categories: list of categories, such as ["conferences", "expos", "concerts",
            "festivals", "performing-arts", "sports", "community"].
        rank_type: when it is not None, events are filtered according to rank_type,
            filter_parameter and rank_threshold. The value could be either
            "rank", "local_rank" or "aviation_rank".
        filter_parameter: the value could be either "gte", "gt", "lte" or "lt".
            Note that "lte" or "lt" is not commonly used in practice
            since we are interested in events with large ranks.
        rank_threshold: the value could be chosen between 0 and 100 depending on the
            assumption of interested events' sizes.
    return:
        event_df: pandas DataFrame of Attended Events. When no events match
            the query, an empty DataFrame with the expected columns is
            returned instead of raising.
    """
    # Columns kept from each event dict returned by the SDK.
    target_columns = [
        "id",
        "title",
        "category",
        "description",
        "duration",
        "start",
        "end",
        "predicted_end",
        "first_seen",
        "labels",
        "location",
        "place_hierarchies",
        "timezone",
        "entities",
        "phq_attendance",
        "rank",
        "local_rank",
        "aviation_rank",
    ]
    within = f"{radius}{radius_unit}@{latitude},{longitude}"
    # Resolve the IANA timezone at the query point so the active-window
    # filters below are interpreted in the location's local time.
    timezone = TimezoneFinder().timezone_at(lat=latitude, lng=longitude)
    params = {
        "active__gte": start_time,
        "active__lte": end_time,
        "active__tz": timezone,
        "within": within,
        "category": categories,
        "limit": 500,
    }
    if rank_type is not None:
        params[f"{rank_type}__{filter_parameter}"] = rank_threshold
    # Iterate through all the events that match our criteria; iter_all
    # follows the API's pagination for us.
    result_list = [event.to_dict() for event in phq.events.search(params).iter_all()]
    if not result_list:
        # Guard: a DataFrame built from an empty list has no columns, so the
        # column selection below would raise a KeyError.
        return pd.DataFrame(columns=target_columns)
    # Selecting the target fields.
    event_df = pd.DataFrame(result_list)[target_columns]
    return event_df
Query Attended Events¶
In [4]:
# Query window and location parameters for the example (downtown Chicago).
start_time = "2019-01-01"
end_time = "2021-03-31"
radius = 10
radius_unit = "km"
latitude_store, longitude_store = (41.881832, -87.623177)  # Chicago example
categories = [
    "conferences",
    "expos",
    "concerts",
    "festivals",
    "performing-arts",
    "sports",
    "community",
]
# CSV path doubles as the cache target in SDK mode and the input in CSV mode.
file_name = (
    f"data/event_data/radius{radius}{radius_unit}_{latitude_store}_"
    + f"{longitude_store}_{start_time}_{end_time}.csv"
)

if RUN_SETTING == "SDK":
    event_df = query_attended_events(
        start_time,
        end_time,
        radius,
        radius_unit,
        latitude_store,
        longitude_store,
        categories,
    )
    event_df.to_csv(file_name, index=False)
    # Convert to string format so the SDK path exposes the same dtypes
    # as the CSV path downstream.
    event_df["start"] = event_df["start"].apply(lambda x: str(x))
    event_df["end"] = event_df["end"].apply(lambda x: str(x))
    event_df["first_seen"] = event_df["first_seen"].apply(lambda x: str(x))
    event_df["entities"] = event_df["entities"].apply(lambda x: str(x))
elif RUN_SETTING == "CSV":
    event_df = pd.read_csv(file_name)
    # Parse the stringified Python list literals back into lists.
    # ast.literal_eval only accepts literals, so (unlike eval) arbitrary
    # code embedded in the CSV cannot be executed.
    event_df["labels"] = event_df["labels"].apply(ast.literal_eval)
    event_df["location"] = event_df["location"].apply(ast.literal_eval)
else:
    # Fail fast: continuing would raise a confusing NameError on event_df below.
    raise ValueError('Must set RUN_SETTING to either "SDK" or "CSV"')

event_df = event_df.sort_values("start")
event_df = event_df[
    [
        "id",
        "title",
        "category",
        "description",
        "duration",
        "start",
        "end",
        "predicted_end",
        "first_seen",
        "labels",
        "location",
        "place_hierarchies",
        "timezone",
        "entities",
        "phq_attendance",
        "rank",
        "local_rank",
        "aviation_rank",
    ]
]
# Record the categories based on the fetched events.
categories_from_event_df = list(event_df["category"].unique())
event_df[:5].T
Out[4]:
38901 | 38900 | 38899 | 38898 | 38896 | |
---|---|---|---|---|---|
id | Qmhr4YY87oDYWBgg9P | dGwZXm7ACAusJMbAF6 | HdhQ8ZBoTsfgs7jNf5 | 8hm6ncUabT6QHXm2qY | 2DukUe4S6uXMTkaXum |
title | Navy Pier Winter Wonderfest: The UltimateTV Co... | New Year’s Eve at RPM Steak | Ring in the New Year at Hub 51 | Elf’d Up Holiday Pop-Up | New Year's Eve Dinner at Il Porcellino |
category | festivals | community | community | community | community |
description | The Ultimate TV and our good friends at Navy P... | Bid farewell to 2018 and ring in the New Year ... | Kick off the New Year at Hub 51, where holiday... | The Stretch Bar and Grill for the holidays is ... | Ring in 2019 at Il Porcellino with luxe holida... |
duration | 720000 | 50400 | 54000 | 43200 | 75600 |
start | 2018-12-29 16:00:00+00:00 | 2018-12-31 17:00:00+00:00 | 2018-12-31 17:00:00+00:00 | 2018-12-31 21:00:00+00:00 | 2018-12-31 22:00:00+00:00 |
end | 2019-01-07 00:00:00+00:00 | 2019-01-01 07:00:00+00:00 | 2019-01-01 08:00:00+00:00 | 2019-01-01 09:00:00+00:00 | 2019-01-01 19:00:00+00:00 |
predicted_end | NaN | NaN | NaN | NaN | NaN |
first_seen | 2018-12-31 08:25:26+00:00 | 2018-11-18 09:29:39+00:00 | 2018-11-18 08:19:38+00:00 | 2018-12-14 19:14:50+00:00 | 2018-11-24 03:43:43+00:00 |
labels | [entertainment, festival] | [community, food] | [community, food] | [community, food] | [community, food] |
location | [-87.607291, 41.89162] | [-87.630606, 41.889367] | [-87.629858, 41.889806] | [-87.655124, 41.945777] | [-87.6303, 41.88979] |
place_hierarchies | [['6295630', '6255149', '6252001', '4896861', ... | [['6295630', '6255149', '6252001', '4896861', ... | [['6295630', '6255149', '6252001', '4896861', ... | [['6295630', '6255149', '6252001', '4896861', ... | [['6295630', '6255149', '6252001', '4896861', ... |
timezone | America/Chicago | America/Chicago | America/Chicago | America/Chicago | America/Chicago |
entities | [] | [{'entity_id': '38mNe3yKj4ZptfGxKqY4GY4', 'nam... | [{'entity_id': '3237psvZvWhSJCMPF9PdUgm', 'nam... | [{'entity_id': 'x56rAXZkb29DA7j8NajQDE', 'name... | [{'entity_id': '4AhEiZH85jUPgZy4punF3H', 'name... |
phq_attendance | NaN | 38 | 38 | 11 | 38 |
rank | 0 | 22 | 22 | 11 | 22 |
local_rank | 0 | 30 | 29 | 19 | 29 |
aviation_rank | NaN | NaN | NaN | NaN | NaN |
Event Preprocessing¶
In [5]:
def get_local_dt(dt, tz):
    """
    Localize an RFC 3339 datetime string to a given timezone.

    Args:
        dt: a datetime in string format (RFC 3339); a falsy value yields None.
        tz: IANA timezone name to localise to; when falsy, the parsed
            datetime is returned unchanged.
    return:
        local_dt: local datetime according to the given timezone, the parsed
            datetime when no timezone is supplied, or None for empty input.
    """
    if not dt:
        return None
    parsed = rfc3339.parse_datetime(dt)
    if tz:
        return parsed.astimezone(pytz.timezone(tz))
    return parsed
def update_end_time(row):
    """
    Prefer the predicted end time over the scheduled end time when available.

    Args:
        row: mapping (e.g. a DataFrame row) with "predicted_end" and "end"
            entries; an unavailable prediction is a non-string (e.g. NaN).
    return:
        The "predicted_end" value when it is a string, otherwise "end".
    """
    predicted = row["predicted_end"]
    return predicted if isinstance(predicted, str) else row["end"]
# Derive local-time and duration features for every event.
# Convert to local date and local time of the start of the event.
event_df["start_local"] = event_df.apply(
    lambda row: get_local_dt(row["start"], row["timezone"]), axis=1
)
event_df["start_date"] = event_df.start_local.map(lambda x: x.date())
# Convert to local date and local time at the end of the event,
# preferring predicted_end over end when a prediction exists.
event_df["end_update"] = event_df.apply(
    lambda row: update_end_time(row),
    axis=1,
)
event_df["end_local"] = event_df.apply(
    lambda row: get_local_dt(row["end_update"], row["timezone"]), axis=1
)
event_df["end_date"] = event_df.end_local.map(lambda x: x.date())
# Convert to local date and local time when the event was seen for the first time.
event_df["first_seen_local"] = event_df.apply(
    lambda row: get_local_dt(row["first_seen"], row["timezone"]), axis=1
)
event_df["first_seen_date"] = event_df.first_seen_local.map(lambda x: x.date())
# Duration of each event in days (inclusive of both endpoints, hence +1).
event_df["duration_day"] = event_df.apply(
    lambda row: (row["end_date"] - row["start_date"]).days + 1, axis=1
)
# Days between when the event was first seen and its END date (inclusive).
# NOTE(review): the original comment said "start of the event" but the code
# uses end_date — confirm which is intended before relying on this feature.
event_df["day_from_first_seen"] = event_df.apply(
    lambda row: (row["end_date"] - row["first_seen_date"]).days + 1, axis=1
)
Overview of Attended Events¶
Number of events per category¶
In [6]:
# Count events per category (non-null ids per group) for the pie chart.
event_df_category = (
    event_df.groupby("category")["id"].count().rename("count").to_frame()
)

pie_trace = go.Pie(
    labels=event_df_category.index,
    values=event_df_category["count"],
)
fig = go.Figure(data=[pie_trace])
# Show absolute counts on the slices; label and percentage appear on hover.
fig.update_traces(hoverinfo="label+percent", textinfo="value", textfont_size=20)
fig.show()