import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors


# Load the beer profile / review dataset.
ratings = pd.read_csv("beer_profile_and_ratings.csv")

# Map the original CSV headers to snake_case names (no spaces, no capitals)
# so columns are easy to reference in the rest of the notebook.
mappings = {
    'Name': 'name',
    'Style': 'style',
    'Brewery': 'brewery',
    'Beer Name (Full)': 'full_name',
    'Description': 'description',
    'ABV': 'abv',
    'Min IBU': 'min_ibu',
    'Max IBU': 'max_ibu',
    'Astringency': 'astringency',
    'Body': 'body',
    'Alcohol': 'alcohol',
    'Bitter': 'bitter',
    'Sweet': 'sweet',
    'Sour': 'sour',
    'Salty': 'salty',
    'Fruits': 'fruits',
    'Hoppy': 'hoppy',
    'Spices': 'spices',
    'Malty': 'malty',
    'review_aroma': 'review_aroma',
    'review_appearance': 'review_appearance',
    'review_palate': 'review_palate',
    'review_taste': 'review_taste',
    'review_overall': 'review_overall',
    'number_of_reviews': 'number_of_reviews',
}
ratings = ratings.rename(columns=mappings)

# Drop the free-text / identifier columns; everything that remains is fed
# to the scaler and, from there, to the clustering below.
text_columns = ["name", "style", "brewery", "full_name", "description"]
ratings_numerical = ratings.drop(columns=text_columns)

# Standardise the remaining columns so no single feature dominates the
# distance computations purely because of its units.
scaler = StandardScaler()
X = scaler.fit_transform(ratings_numerical)


# Elbow method: fit KMeans for every k in K and record two quality metrics.
#   distortion - mean Euclidean distance from each sample to its nearest centroid
#   inertia    - the fitted model's `inertia_` attribute
distortions = []
inertias = []
K = range(1, 100)
for k in K:
    # Fit exactly once per k (the model used to be fitted twice, doubling the
    # run time for no benefit). A fixed random_state makes the elbow curves
    # reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X)

    # Distance from every sample to its closest centroid.
    nearest_centroid_dist = cdist(X, kmeanModel.cluster_centers_,
                                  'euclidean').min(axis=1)
    distortions.append(nearest_centroid_dist.mean())
    inertias.append(kmeanModel.inertia_)


# Draw one elbow curve per metric collected above: same axes and marker
# style, only the y-data, y-label and title differ.
elbow_curves = (
    (distortions, 'Distortion', 'The Elbow Method using Distortion'),
    (inertias, 'Inertia', 'The Elbow Method using Inertia'),
)
for metric_values, y_label, title in elbow_curves:
    plt.plot(K, metric_values, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()


# Build and fit the final model with the chosen number of clusters.
# The model is fitted once (a second, redundant fit has been removed) and
# random_state is pinned so that the cluster ids assigned below are the same
# on every run.
kmeanModel = KMeans(n_clusters=20, random_state=42).fit(X)

# Attach each beer's cluster id; labels_ is in the same row order as X,
# which was built from `ratings`.
ratings["cluster"] = kmeanModel.labels_


# Number of beers assigned to each cluster, largest cluster first.
cluster_sizes = ratings["cluster"].value_counts()
cluster_sizes

1     350
7     325
16    234
15    224
6     217
14    216
11    211
5     180
19    172
18    162
3     148
10    118
12    104
2     104
13     99
4      89
8      84
9      80
17     67
0      13
Name: cluster, dtype: int64


# Spot-check one cluster: which beer styles ended up in cluster 17?
ratings.loc[ratings["cluster"] == 17, "style"].unique()

array(['Brown Ale - Belgian Dark', 'Chile Beer', 'Cream Ale',
       'Fruit and Field Beer', 'Herb and Spice Beer', 'Lager - Rauchbier',
       'Pale Ale - Belgian', 'Porter - English', 'Pumpkin Beer',
       'Rye Beer', 'Smoked Beer', 'Stout - Foreign / Export',
       'Strong Ale - English', 'Wheat Beer - American Pale',
       'Wheat Beer - Dunkelweizen', 'Winter Warmer'], dtype=object)


def cluster_ratio(liked_beers: pd.DataFrame):
    """Return the fraction of ``liked_beers`` that falls in each cluster.

    Args:
        liked_beers: DataFrame with a ``cluster`` column.

    Returns:
        A dict mapping cluster id to its share of the rows (shares sum to 1),
        ordered from the most to the least frequent cluster.
    """
    shares = liked_beers["cluster"].value_counts(normalize=True)
    return dict(shares)


def get_recommendation(liked_beers: pd.DataFrame):
    """Recommend one beer from the cluster the user likes most.

    The most frequent cluster among ``liked_beers`` is selected. Within that
    cluster, the beer (not already liked) whose features are closest to the
    centroid of the liked beers is returned.

    Args:
        liked_beers: Rows describing the beers the user likes. Must carry the
            same columns as the module-level ``ratings`` frame, including
            ``cluster``.

    Returns:
        The row of ``ratings`` (a ``pd.Series``) for the recommended beer.

    Raises:
        ValueError: If ``liked_beers`` yields no cluster (e.g. it is empty),
            or if every beer in the chosen cluster is already liked.
    """
    # Share of liked beers per cluster; pick the most frequent one explicitly
    # rather than relying on the ordering of the returned dict.
    ratio = cluster_ratio(liked_beers)
    if not ratio:
        raise ValueError("liked_beers must contain at least one clustered beer")
    max_cluster = max(ratio, key=ratio.get)

    # Liked beers inside that cluster, and every catalogue beer in it.
    liked_in_cluster = liked_beers[liked_beers["cluster"] == max_cluster]
    total_in_cluster = ratings[ratings["cluster"] == max_cluster]

    # Never recommend something the user already likes (matched by name).
    candidates = total_in_cluster[
        ~total_in_cluster["name"].isin(liked_in_cluster["name"])
    ]
    if candidates.empty:
        raise ValueError(
            f"no beers left to recommend in cluster {max_cluster}"
        )

    # Feature columns: everything except the text columns and the cluster
    # label (which is an id, not a feature).
    non_feature_cols = {
        "name", "style", "brewery", "full_name", "description", "cluster",
    }
    feature_cols = [c for c in ratings.columns if c not in non_feature_cols]

    # The clusters were built on standardised features, so measure distance
    # on the same scale. Dividing each difference by the column's population
    # standard deviation matches what StandardScaler does (the mean cancels
    # in a difference). Without this, large-valued columns such as
    # number_of_reviews swamp every other feature.
    scale = ratings[feature_cols].std(ddof=0).replace(0, 1)

    # Centroid of the liked beers, then squared distance of each candidate.
    cluster_centroid = liked_in_cluster[feature_cols].mean()
    deltas = (candidates[feature_cols] - cluster_centroid) / scale
    distances = (deltas ** 2).sum(axis=1)

    # `distances` keeps the index labels of `ratings`, so look up by label.
    return ratings.loc[distances.idxmin()]


# Pretend the user likes the beers at index labels 1 through 4.
sample = ratings.loc[[1, 2, 3, 4]]
sample


# Smoke test: recommend a beer for the sample and print its key fields.
rec = get_recommendation(sample)
print(*(rec[field] for field in ("name", "style", "brewery")), sep=" ||| ")

Hofbräu Maibock (Urbock) ||| Bock - Maibock ||| Hofbräuhaus München

	name	style	brewery	full_name	description	abv	min_ibu	max_ibu	astringency	body	...	hoppy	spices	malty	review_aroma	review_appearance	review_palate	review_taste	review_overall	number_of_reviews	cluster
1	Double Bag	Altbier	Long Trail Brewing Co.	Long Trail Brewing Co. Double Bag	Notes:This malty, full-bodied double alt is al...	7.2	25	50	12	57	...	35	12	84	3.798337	3.846154	3.904366	4.024948	4.034304	481	1
2	Long Trail Ale	Altbier	Long Trail Brewing Co.	Long Trail Brewing Co. Long Trail Ale	Notes:Long Trail Ale is a full-bodied amber al...	5.0	25	50	14	37	...	54	4	62	3.409814	3.667109	3.600796	3.631300	3.830239	377	7
3	Doppelsticke	Altbier	Uerige Obergärige Hausbrauerei GmbH / Zum Uerige	Uerige Obergärige Hausbrauerei GmbH / Zum Ueri...	Notes:	8.5	25	50	13	55	...	40	16	119	4.148098	4.033967	4.150815	4.205163	4.005435	368	19
4	Sleigh'r Dark Doüble Alt Ale	Altbier	Ninkasi Brewing Company	Ninkasi Brewing Company Sleigh'r Dark Doüble A...	Notes:Called 'Dark Double Alt' on the label.Se...	7.2	25	50	25	51	...	51	20	95	3.625000	3.973958	3.734375	3.765625	3.817708	96	1

Perform the elbow method to find a good value for k.

Around 15 may be best; however, I'm going with a few extra clusters (20) to get a little more variation, because of how the recommendation will be performed.

Pretty solid here: cluster 0 (only 13 beers) will likely not be recommended, but the rest have a decent number of beers each.

This function returns the share of a consumer's liked beers that falls in each cluster. It is used to determine which cluster to draw from when making a recommendation.

The actual recommendation function; it will be deployed to AWS Lambda to be called via an API.

Quickly check that it actually works.