import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors
# Load the beer profile / review dataset from the working directory.
ratings = pd.read_csv("beer_profile_and_ratings.csv")

# Map the CSV headers to lowercase snake_case names (no spaces) so the
# columns are easy to reference. The review_* headers already comply and
# simply map to themselves.
mappings = {
    # Descriptive text columns
    "Name": "name",
    "Style": "style",
    "Brewery": "brewery",
    "Beer Name (Full)": "full_name",
    "Description": "description",
    # Strength and bitterness range
    "ABV": "abv",
    "Min IBU": "min_ibu",
    "Max IBU": "max_ibu",
    # Taste profile
    "Astringency": "astringency",
    "Body": "body",
    "Alcohol": "alcohol",
    "Bitter": "bitter",
    "Sweet": "sweet",
    "Sour": "sour",
    "Salty": "salty",
    "Fruits": "fruits",
    "Hoppy": "hoppy",
    "Spices": "spices",
    "Malty": "malty",
    # Review aggregates
    "review_aroma": "review_aroma",
    "review_appearance": "review_appearance",
    "review_palate": "review_palate",
    "review_taste": "review_taste",
    "review_overall": "review_overall",
    "number_of_reviews": "number_of_reviews",
}
ratings = ratings.rename(columns=mappings)

# Feature frame: every column except the five descriptive text ones.
ratings_numerical = ratings.drop(
    columns=["name", "style", "brewery", "full_name", "description"]
)
# Standardise the features with StandardScaler; the fitted scaler is kept
# around so the same transformation can be re-applied later.
scaler = StandardScaler().fit(ratings_numerical)
X = scaler.transform(ratings_numerical)
# Elbow analysis: fit k-means for every candidate k and record two measures
# of cluster tightness, then plot each against k.
distortions = []  # mean Euclidean distance from a sample to its nearest centroid
inertias = []  # KMeans.inertia_ of the fitted model
K = range(1, 100)
for k in K:
    # Fit exactly once per k: KMeans(...).fit(X) already trains the model, so
    # a second .fit(X) only repeats the work (and, unseeded, replaces the
    # first result). The fixed seed makes the curves reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)
    nearest_centroid_dist = cdist(
        X, kmeanModel.cluster_centers_, 'euclidean'
    ).min(axis=1)
    distortions.append(nearest_centroid_dist.mean())
    inertias.append(kmeanModel.inertia_)

# One figure per metric, same layout for both.
for metric_name, metric_values in (
    ('Distortion', distortions),
    ('Inertia', inertias),
):
    plt.plot(K, metric_values, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel(metric_name)
    plt.title(f'The Elbow Method using {metric_name}')
    plt.show()
# Build the final model with 20 clusters (presumably chosen from the elbow
# plots - confirm). It is fitted once; the original second .fit(X) call only
# re-ran the training. The fixed seed keeps cluster labels stable between
# runs, which matters because later cells refer to clusters by number.
kmeanModel = KMeans(n_clusters=20, random_state=0).fit(X)

# Attach each beer's cluster label and show the cluster sizes.
ratings["cluster"] = kmeanModel.labels_
ratings["cluster"].value_counts()
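# Output recorded from one notebook run (cluster id, number of beers);
# the labels depend on the KMeans initialisation:
#   1     350
#   7     325
#   16    234
#   15    224
#   6     217
#   14    216
#   11    211
#   5     180
#   19    172
#   18    162
#   3     148
#   10    118
#   12    104
#   2     104
#   13     99
#   4      89
#   8      84
#   9      80
#   17     67
#   0      13
#   Name: cluster, dtype: int64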
# List the distinct beer styles that ended up in cluster 17.
ratings.loc[ratings["cluster"] == 17, "style"].unique()
# Output recorded from one notebook run:
#   array(['Brown Ale - Belgian Dark', 'Chile Beer', 'Cream Ale',
#          'Fruit and Field Beer', 'Herb and Spice Beer', 'Lager - Rauchbier',
#          'Pale Ale - Belgian', 'Porter - English', 'Pumpkin Beer', 'Rye Beer',
#          'Smoked Beer', 'Stout - Foreign / Export', 'Strong Ale - English',
#          'Wheat Beer - American Pale', 'Wheat Beer - Dunkelweizen',
#          'Winter Warmer'], dtype=object)
def cluster_ratio(liked_beers: pd.DataFrame) -> dict:
    """Return the share of liked beers that falls into each cluster.

    Args:
        liked_beers: Frame with a "cluster" column holding one cluster label
            per beer.

    Returns:
        A dict mapping cluster label -> fraction of rows carrying that label,
        ordered from the most to the least frequent cluster. An empty frame
        yields an empty dict.
    """
    # normalize=True divides by the total once, instead of recomputing
    # counts.sum() for every element inside a lambda.
    return liked_beers["cluster"].value_counts(normalize=True).to_dict()
def get_recommendation(liked_beers: pd.DataFrame):
    """Recommend one not-yet-liked beer from the user's dominant cluster.

    The cluster that holds the largest share of ``liked_beers`` is selected,
    the centroid of the liked beers inside that cluster is computed, and the
    remaining beer of that cluster closest to the centroid is returned.

    Relies on the module-level ``ratings`` frame (including its "cluster"
    column), the ``ratings_numerical`` feature frame and the fitted
    ``scaler``.

    Args:
        liked_beers: Rows taken from ``ratings`` (same columns, including
            "cluster") describing the beers the user likes.

    Returns:
        The row (``pd.Series``) of the recommended beer.

    Raises:
        ValueError: If ``liked_beers`` is empty, or if every beer of the
            dominant cluster is already among the liked beers.
    """
    if liked_beers.empty:
        raise ValueError("liked_beers must contain at least one beer")

    # Pick the cluster with the largest share explicitly rather than relying
    # on the ordering of the ratio dict (ties resolve to the first entry).
    ratio = cluster_ratio(liked_beers)
    max_cluster = max(ratio, key=ratio.get)

    # Liked beers inside the dominant cluster, and the remaining candidates
    # of that cluster (beers are matched by name).
    liked_in_cluster = liked_beers[liked_beers["cluster"] == max_cluster]
    total_in_cluster = ratings[ratings["cluster"] == max_cluster]
    candidates = total_in_cluster[
        ~total_in_cluster["name"].isin(liked_in_cluster["name"])
    ]
    if candidates.empty:
        raise ValueError(
            f"no beers left to recommend in cluster {max_cluster}"
        )

    # Measure similarity in the same standardised feature space the clusters
    # were built in. Raw values would let large-magnitude columns such as
    # number_of_reviews dominate the distance, and the "cluster" label itself
    # is not a feature.
    feature_cols = list(ratings_numerical.columns)
    liked_scaled = scaler.transform(liked_in_cluster[feature_cols])
    candidates_scaled = scaler.transform(candidates[feature_cols])

    cluster_centroid = liked_scaled.mean(axis=0)
    # Squared Euclidean distance is enough for ranking.
    squared_distances = ((candidates_scaled - cluster_centroid) ** 2).sum(axis=1)
    return candidates.iloc[int(squared_distances.argmin())]
# Use the beers at index labels 1 through 4 as an example set of liked beers.
sample = ratings.loc[[1, 2, 3, 4]]
sample
# Output recorded from one notebook run (first column is the row index):
# | | name | style | brewery | full_name | description | abv | min_ibu | max_ibu | astringency | body | ... | hoppy | spices | malty | review_aroma | review_appearance | review_palate | review_taste | review_overall | number_of_reviews | cluster |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# | 1 | Double Bag | Altbier | Long Trail Brewing Co. | Long Trail Brewing Co. Double Bag | Notes:This malty, full-bodied double alt is al... | 7.2 | 25 | 50 | 12 | 57 | ... | 35 | 12 | 84 | 3.798337 | 3.846154 | 3.904366 | 4.024948 | 4.034304 | 481 | 1 |
# | 2 | Long Trail Ale | Altbier | Long Trail Brewing Co. | Long Trail Brewing Co. Long Trail Ale | Notes:Long Trail Ale is a full-bodied amber al... | 5.0 | 25 | 50 | 14 | 37 | ... | 54 | 4 | 62 | 3.409814 | 3.667109 | 3.600796 | 3.631300 | 3.830239 | 377 | 7 |
# | 3 | Doppelsticke | Altbier | Uerige Obergärige Hausbrauerei GmbH / Zum Uerige | Uerige Obergärige Hausbrauerei GmbH / Zum Ueri... | Notes: | 8.5 | 25 | 50 | 13 | 55 | ... | 40 | 16 | 119 | 4.148098 | 4.033967 | 4.150815 | 4.205163 | 4.005435 | 368 | 19 |
# | 4 | Sleigh'r Dark Doüble Alt Ale | Altbier | Ninkasi Brewing Company | Ninkasi Brewing Company Sleigh'r Dark Doüble A... | Notes:Called 'Dark Double Alt' on the label.Se... | 7.2 | 25 | 50 | 25 | 51 | ... | 51 | 20 | 95 | 3.625000 | 3.973958 | 3.734375 | 3.765625 | 3.817708 | 96 | 1 |
# 4 rows × 26 columns
# Ask for a recommendation based on the sample and print its name, style
# and brewery on one line.
rec = get_recommendation(sample)
print(" ||| ".join(str(rec[field]) for field in ("name", "style", "brewery")))
# Output recorded from one notebook run:
#   Hofbräu Maibock (Urbock) ||| Bock - Maibock ||| Hofbräuhaus München