#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
from operator import itemgetter

import warnings
warnings.filterwarnings("ignore")


data = pd.read_csv("data.csv")
data.head()


print("Number of rows in the dataset:", data.shape[0])
print("Number of columns in the dataset:", data.shape[1])
print("There are {} unique manhwa/manga available".format(len(data["title"].unique())))

Number of rows in the dataset: 70948
Number of columns in the dataset: 6
There are 70939 unique manhwa/manga available


useful_data = data.drop(columns=["cover"])
useful_data = useful_data[useful_data["year"] >= 2000]
useful_data = useful_data.sort_values("year")


#dropping manwhas and manhuas
#useful_data = useful_data[useful_data["tags"] in "]

useful_data.head()


tag_column = useful_data["tags"]

for index,row in useful_data["tags"].iteritems():
    row = row[1:-1]
    row = row.replace("'", "").replace(" ", "")
    row = row.split(",")
    useful_data.at[index,"tags"] = row

weed = {}
for tag in useful_data["tags"]:
     for genre in tag:
        if (genre in weed):
            weed[genre] = weed[genre] + 1
        else:
            weed[genre] = 1
            
#print(weed)


N = 50
 
# N largest values in dictionary
# Using sorted() + itemgetter() + items()
weed = dict(sorted(weed.items(), key=itemgetter(1), reverse=True)[:N])
 
# printing result
print("The top N value pairs are " + str(weed))

The top N value pairs are {'Romance': 28058, 'Comedy': 19669, 'Drama': 17042, 'Fantasy': 15179, 'BL': 12347, 'SchoolLife': 11989, 'Action': 11317, 'Yaoi': 9996, 'FullColor': 9668, 'LightNovels': 9287, 'Webtoons': 8871, 'Seinen': 8170, 'SliceofLife': 7653, 'Supernatural': 7472, 'Manhwa': 6861, 'Shoujo': 6677, 'Shounen': 6369, 'Manhua': 5645, 'Adventure': 5471, 'Josei': 5002, 'OneShot': 4452, 'Ecchi': 3572, 'SciFi': 3415, 'ExplicitSex': 3370, 'Historical': 3220, 'PersoninaStrangeWorld': 3134, 'BasedonaWebNovel': 3040, 'Mystery': 3019, 'AdultCouples': 2863, 'Collections': 2751, 'Shounen-ai': 2351, 'BasedonaNovel': 2310, 'WebNovels': 2224, 'Non-HumanProtagonists': 2170, 'Psychological': 2165, 'BasedonaLightNovel': 2065, 'GL': 2010, 'MatureThemes': 2001, 'Horror': 1991, 'MatureRomance': 1898, '4-koma': 1818, 'AdaptedtoAnime': 1802, 'Magic': 1789, 'Isekai': 1788, 'Harem': 1659, 'Harlequin': 1639, 'Royalty': 1587, 'Smut': 1327, 'Shoujo-ai': 1251, 'Demons': 1130}


values_list = list(weed.values())

values_array = np.array(values_list)

keys_list = list(weed.keys())

keys_array = np.array(keys_list)

no_of_colors=len(keys_array)
color=["#"+''.join([random.choice('0123456789ABCDEF') for i in range(6)])
       for j in range(no_of_colors)]

bar_plot = pd.DataFrame({"tags":values_array},index = keys_array)
#bar_plot.plot.bar()

colors = color
plt.figure(figsize=(40, 20))
plt.bar(keys_array, values_array, color=colors,width = 1)
plt.xticks(rotation=30)
plt.title('The different types of genre', fontsize=20)
plt.xlabel('Different genres', fontsize=20)
plt.ylabel('#', fontsize=20)
plt.grid(True)
plt.show()


# one hot encode 
useful_data.head()
ohctags = {}

for tags in useful_data["tags"]:
    splt = tags
    for wtag in weed.keys():
        if (wtag not in ohctags.keys()):
            ohctags[wtag] = []
        if (wtag in splt):
            ohctags[wtag].append(1)
        else:
            ohctags[wtag].append(0)
            
ohctags_df = pd.DataFrame.from_dict(ohctags)
ohctags_df.head(10)


#combine both into one dataframe!

prep_df = useful_data
#prep_df = prep_df.reset_index()
prep_df = pd.concat([useful_data.reset_index(drop=True), ohctags_df.reset_index(drop=True)], axis=1)
prep_df.head()


from sklearn.neighbors import NearestNeighbors

# get all the features by themselves as a numpy array
X = ohctags_df.to_numpy()

# implement knn using scipi, find the first k nearest neighbors.
# ball tree is to make sure we dont calculate 67,000 distances 
# we choose 12 to avoid duplicates
knn = NearestNeighbors(n_neighbors=11, algorithm='ball_tree')
nbrs = knn.fit(X)


# code for grabbing a random sample - used for quick testing.
# rand = prep_df.sample()
# Y = (rand.drop(["title", "description", "rating", "year", "tags"], axis=1)).to_numpy()

# This function below prints a readable string representation of the first K reccomendations for Y
def knnrecs(Y):
    Ydat = (Y.drop(labels=["title", "description", "tags", "rating", "year"])).to_numpy()
    distances, indices = nbrs.kneighbors([Ydat])
    #print(indices)
    print("Best Reccomendations if You've Read: " + str(Y["title"]))
    print(str(Y["tags"]))
    print()
    i = 0
    for x in indices[0]:
        row = useful_data.iloc[x]
        title = row["title"]
        desc = row["description"]
        tags = row["tags"]
        if (title != Y["title"]):
            print(str(title))
            print("distance: " + str(distances[0][i]))
            print("tags: " + str(tags))
            print("description: " + str(desc))
            print()
        i += 1
    return(indices[0])


# Let's test this with one of Sharath's favorite mangas!
dsid = prep_df.index[prep_df["title"] == "The Eminence in Shadow"]
Y = prep_df.iloc[dsid[0]]

recs = knnrecs(Y)

Best Reccomendations if You've Read: The Eminence in Shadow
['Action', 'Comedy', 'Fantasy', 'Shounen', 'Isekai', 'Magic', 'MagicSchool', 'OverpoweredMainCharacters', 'Parody', 'PersoninaStrangeWorld', 'Reincarnation', 'SchoolLife', 'Swordplay', 'ExplicitViolence', 'AdaptedtoAnime', 'BasedonaLightNovel']

Villainess: Reloaded! Blowing Away Bad Ends with Modern Weapons
distance: 1.4142135623730951
tags: ['Action', 'Comedy', 'Fantasy', 'Shounen', 'Guns', 'Isekai', 'Magic', 'ModernKnowledge', 'OtomeGame', 'PersoninaStrangeWorld', 'Reincarnation', 'Villainess', 'Violence', 'BasedonaLightNovel']
description: Astrid von Oldenburg is no ordinary four-year-old. She’s a child prodigy with a passion for military technology who now finds herself reincarnated in the world of an otome game she played during her past life. But not as the heroine! As the game’s villainess, she's born with wealth, power, and a fearsome talent for magic. The only problem is that every route leads to her inevitable destruction. Or does it? What if averting her destruction was a simple matter of amassing enough firepower to annihilate anyone who dared even attempt to bring her down?! In a bid to resist fate, the young villainess embarks on the reproduction of all of her favorite weaponry. Whatever it takes, Astrid’s determined to blow away her bad ends with superior firepower!

Doryoku Shisugita Sekai Saikyou no Butouka wa, Mahou Sekai wo Yoyuu de Ikinuku.
distance: 1.4142135623730951
tags: ['Action', 'Comedy', 'Fantasy', 'Shounen', 'Isekai', 'Magic', 'MartialArts', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'Reincarnation', 'BasedonaLightNovel']
description: One day, a martial artist named Ash was suddenly reborn into another world. He decided that he will become a sorcerer in his second life.He went through harsh training after becoming the apprentice of the former hero, Morris. Then, the "Emperor of darkness" suddenly appeared! Right when the end of the world was approaching, he one-shotted the demon lord?!

Tsukimichi: Moonlit Fantasy
distance: 1.4142135623730951
tags: ['Action', 'Adventure', 'Comedy', 'Fantasy', 'Shounen', 'Isekai', 'KingdomBuilding', 'Magic', 'ModernKnowledge', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'RPG', 'SummonedIntoAnotherWorld', 'AdaptedtoAnime', 'BasedonaLightNovel']
description: Misumi Makoto was just a normal high-school student... until the day he was summoned to another world because of an agreement his parents had made with a goddess. Except when he meets the only goddess in the world he's sent to, she insults him by calling him "extremely unattractive" and makes an arbitrary decision to banish him to a deserted wilderness. Makoto searches the desolate land to find other humans, but for some reason only finds creatures that are all nonhuman. Even the two pretty women who decide to follow him on his journey used to be a dragon and a giant spider. Along with two extremely eccentric characters (but really reliable in battles), so begins Makoto's unlucky adventure in the new world! A fantasy about a teenage boy who keeps getting hit by one problem after another!

Didn't I Say to Make My Abilities Average in the Next Life?!
distance: 1.4142135623730951
tags: ['Action', 'Adventure', 'Comedy', 'Fantasy', 'Shounen', 'Hiatus', 'Isekai', 'Magic', 'Nobility', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'Reincarnation', 'AdaptedtoAnime', 'BasedonaLightNovel']
description: When she turns ten years old, Adele von Ascham is hit with a horrible headache–and memories of her previous life as an eighteen-year-old Japanese girl named Kurihara Misato. That life changed abruptly, however, when Misato died trying to aid a little girl and met god. During that meeting, she made an odd request and asked for average abilities in her next life. But few things–especially wishes–ever go quite as planned.

Ore no Ie ga Maryoku Spot Datta Ken: Sun de Iru Dake de Sekai Saikyou
distance: 1.7320508075688772
tags: ['Action', 'Comedy', 'Fantasy', 'Shounen', 'Dragons', 'Isekai', 'Magic', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'SummonedIntoAnotherWorld', 'ExplicitSex', 'BasedonaLightNovel']
description: Living carefree at home is the greatest shortcut---My House is the world's greatest Magic Power Spot, that being the case both my house and I were summoned to another world by some guys who are aiming for it. However, I've been living in this place for many years and my body is, apparently, abnormally overflowing with magic. Due to some unforeseen circumstances by those guys who summoned me, they quickly ran away. Be that as it may, there are some ill-mannered people who covet the magic leaking out of my house.

Twisted-Wonderland: The Comic - Episode of Heartslabyul
distance: 1.7320508075688772
tags: ['Comedy', 'Fantasy', 'Shounen', 'BoardingSchool', 'Disney', 'Isekai', 'Magic', 'MagicSchool', 'PersoninaStrangeWorld', 'SchoolLife', 'SummonedIntoAnotherWorld', 'BasedonaMobileGame']
description: Enma Yuuken is a high school student and member of kendo club. After an accident with a Black Carriage, he ends in Night Raven College, a pretigious magic school in Twisted Wonderland.

My Instant Death Ability Is So Overpowered, No One in This Other World Stands a Chance Against Me! ΑΩ
distance: 1.7320508075688772
tags: ['Action', 'Adventure', 'Comedy', 'Fantasy', 'Shounen', 'Cheats', 'Isekai', 'Magic', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'SummonedIntoAnotherWorld', 'Violence', 'BasedonaLightNovel']
description: Awaking to absolute chaos and carnage while on a school trip, Yogiri Takatou discovers that everyone in his class has been transported to another world! He had somehow managed to sleep through the entire ordeal himself, missing out on the Gift — powers bestowed upon the others by a mysterious Sage who appeared to transport them. Even worse, he and another classmate were ruthlessly abandoned by their friends, left as bait to distract a nearby dragon. Although not terribly bothered by the thought of dying, he reluctantly decides to protect his lone companion. After all, a lowly Level 1000 monster doesn't stand a chance against his secret power to invoke Instant Death with a single thought! If he can stay awake long enough to bother using it, that is...

Akashic Records of Bastard Magic Instructor
distance: 1.7320508075688772
tags: ['Action', 'Comedy', 'Ecchi', 'Fantasy', 'Shounen', 'Magic', 'MagicSchool', 'SchoolLife', 'Teaching', 'AdaptedtoAnime', 'BasedonaLightNovel']
description: Lumia and Sisti are mages-in-training at a prestigious magical academy where they hope to be taught by the best of the best. However, when their favorite instructor suddenly retires, his replacement turns out to be a total jerk - he's idle, incompetent, and always late! Can Lumia help uncover their new teacher's true potential - and can Sisti still learn magic and unravel the secrets of the mysterious Sky Castle with such a terrible mentor as her guide?

Yankee wa Isekai de Seirei ni Aisaremasu.
distance: 1.7320508075688772
tags: ['Action', 'Comedy', 'Fantasy', 'Delinquents', 'Isekai', 'Magic', 'PersoninaStrangeWorld', 'Reincarnation', 'BasedonaLightNovel']
description: While trying to save a child who was going to be run over by a truck, Manai Zero loses his life. When he wakes up even though he should have lost his life, Manai is given the choice of being reincarnated, but in a different world, with a power called "Beloved by Spirits". A world of magic and sprites, monsters and adventures.

Isekai Cheat Magician
distance: 1.7320508075688772
tags: ['Action', 'Adventure', 'Fantasy', 'Shounen', 'Cheats', 'Guilds', 'Isekai', 'Magic', 'OverpoweredMainCharacters', 'PersoninaStrangeWorld', 'SummonedIntoAnotherWorld', 'AdaptedtoAnime', 'BasedonaLightNovel']
description: As regular high school students Taichi and Rin disappeared in a beam of light. When they came to, the two of them were already in a world of swords and magic. Finally getting away after experiencing an attack by monsters, following the suggestion of adventurers they headed on the path towards the guild. In the guild, the two of them found out that they possessed unbelievably powerful magic. Thus the regular high school students transformed into the strongest cheats...


import re

#Sharath Kannan's List
skidx = []
skidx.append(prep_df.index[prep_df["title"] == "The Eminence in Shadow"][0])
skidx.append(prep_df.index[prep_df["title"] == "Oshi no Ko"][0])
skidx.append(prep_df.index[prep_df["title"] == "Tsukimichi: Moonlit Fantasy"][0])
skidx.append(prep_df.index[prep_df["title"] == "Chainsaw Man"][0])
skidx.append(prep_df.index[prep_df["title"] == "Overlord"][0])
skidx.append(prep_df.index[prep_df["title"] == "The Saga of Tanya the Evil"][0])
skidx.append(prep_df.index[prep_df["title"] == "Kaguya-sama: Love Is War"][0])
skidx.append(prep_df.index[prep_df["title"] == "Reincarnated as a Sword"][0])
skidx.append(prep_df.index[prep_df["title"] == "Black Clover"][0])
skidx.append(prep_df.index[prep_df["title"] == "Dragon Ball Super"][0])
skidx.append(prep_df.index[prep_df["title"] == "Re:ZERO -Starting Life in Another World- Chapter 1: A Day in the Capital"][0])
skidx.append(prep_df.index[prep_df["title"] == "Re:ZERO -Starting Life in Another World- Chapter 2: A Week at the Mansion"][0])
skidx.append(prep_df.index[prep_df["title"] == "Re:ZERO -Starting Life in Another World- Chapter 3: Truth of Zero"][0])
skidx.append(prep_df.index[prep_df["title"] == "Re:ZERO -Starting Life in Another World- Chapter 4: Sanctuary and Witch of Greed"][0])
skidx.append(prep_df.index[prep_df["title"] == "One-Punch Man (Webcomic)"][0])
skidx.append(prep_df.index[prep_df["title"] == "The Quintessential Quintuplets"][0])
skidx.append(prep_df.index[prep_df["title"] == "Classroom of the Elite"][0])
skidx.append(prep_df.index[prep_df["title"] == "Horimiya"][0])
skidx.append(prep_df.index[prep_df["title"] == "Spice and Wolf (Light Novel)"][0])
skidx.append(prep_df.index[prep_df["title"] == "86: Eighty-Six"][0])
skidx.append(prep_df.index[prep_df["title"] == "How a Realist Hero Rebuilt the Kingdom"][0])
skidx.append(prep_df.index[prep_df["title"] == "That Time I Got Reincarnated as a Slime: The Ways of the Monster Nation"][0])
skidx.append(prep_df.index[prep_df["title"] == "That Time I Got Reincarnated as a Slime: Trinity in Tempest"][0])
skidx.append(prep_df.index[prep_df["title"] == "The Rising of the Shield Hero"][0])
skidx.append(prep_df.index[prep_df["title"] == "Goblin Slayer"][0])

# fill out the rest of the dataset.
ld = np.ones(len(skidx))
random_seed = 615
np.random.seed(2129)
# fill rest with random values-0
i = 0
while (i < 35):
    r = np.random.randint(len(prep_df))
    if (r not in skidx):
        skidx = np.append(skidx, r)
        ld = np.append(ld, 0)
        i += 1

sksample = prep_df.iloc[skidx]
sksample["Like_Dislike"] = ld
sksample.head()


# Eric Cho's List
manhwa_data = []
manhwa_data.append(prep_df.index[prep_df["title"] == "Nano Machine"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Tower of God - Part 3"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "A Good Day to be a Dog"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Tower of God"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "A Business Proposal"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Positively Yours"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "What's Wrong with Secretary Kim?"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Hold Me Tight"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Weak Hero"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Bastard"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Teenage Mercenary"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Surviving Romance"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Homeless"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Walk on Water"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Second Life Ranker (Novel)"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Light and Shadow"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Odd Girl Out"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Love Shuttle"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Sweet Home"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Positively Yours"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Unholy Blood"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Medical Return"][0])
manhwa_data.append(prep_df.index[prep_df["title"] == "Who Made Me a Princess"][0])

ld = np.ones(len(manhwa_data))

random_seed = 615
np.random.seed(2193)


for i in range(35):
    ran = np.random.randint(len(prep_df))
    if(ran not in manhwa_data):
        manhwa_data = np.append(manhwa_data,ran)
        ld = np.append(ld,0)
        

manhwa_data = prep_df.iloc[manhwa_data]
manhwa_data["Like_Dislike"]=ld
manhwa_data.head()


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

# set up Sharath's samples
skX = (sksample.drop(labels=["title", "description", "tags", "rating", "year", "Like_Dislike"], axis=1)).to_numpy()
skY = (sksample["Like_Dislike"]).to_numpy()

# make training/testing set
X_train, X_test, Y_train, Y_test = train_test_split(skX, skY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)


# we train 11 KNN models, starting with 1 and going up by 5 up to 50.
K = np.arange(0,40,1)
K[0] = 1

# find training and testing loss
tr_loss = []
te_loss = []
tr_acc = []
te_acc = []
for k in K:
    knn = KNeighborsClassifier(n_neighbors=k, algorithm="brute")
    knn = knn.fit(X_train, Y_train)
    # score returns mean accuracy of training/testing
    tr_acc.append(knn.score(X_train, Y_train))
    te_acc.append(knn.score(X_test, Y_test))
    tr_loss.append(log_loss(Y_train, knn.predict_proba(X_train)))
    te_loss.append(log_loss(Y_test, knn.predict_proba(X_test)))

# plot loss values
plt.title("K-value vs Loss (Sharath Kannan's Data)")
plt.xlabel("K-value")
plt.ylabel("Loss")

# might remove this
ax = plt.gca()
ax.set_ylim([0.0, 1.0])

plt.plot(K, te_loss, tr_loss)
plt.legend(["testing", "training"])
plt.show()


# Accuracy
plt.title("K-value vs Accuracy (Sharath Kannan's Data)")
plt.xlabel("K-value")
plt.ylabel("Accuracy")

# might remove this
ax = plt.gca()
ax.set_ylim([0.0, 1.0])

plt.plot(K, te_acc, tr_acc)
plt.legend(["testing", "training"])
plt.show()


# set up Erics Samples
ecX = (manhwa_data.drop(labels=["title", "description", "tags", "rating", "year", "Like_Dislike"], axis=1)).to_numpy()
ecY = (manhwa_data["Like_Dislike"]).to_numpy()

# make training/testing set
X_train, X_test, Y_train, Y_test = train_test_split(ecX, ecY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)

# we train 11 KNN models, starting with 1 and going up by 5 up to 50.
K = np.arange(0,40,1)
K[0] = 1

# find training and testing loss
tr_loss = []
te_loss = []
tr_acc = []
te_acc = []
for k in K:
    knn = KNeighborsClassifier(n_neighbors=k, algorithm="brute")
    knn = knn.fit(X_train, Y_train)
    # score returns mean accuracy of training/testing
    tr_acc.append(knn.score(X_train, Y_train))
    te_acc.append(knn.score(X_test, Y_test))
    tr_loss.append(log_loss(Y_train, knn.predict_proba(X_train)))
    te_loss.append(log_loss(Y_test, knn.predict_proba(X_test)))

# plot loss values
plt.title("K-value vs Loss (Eric Cho's Data)")
plt.xlabel("K-value")
plt.ylabel("Loss")

# might remove this
ax = plt.gca()
ax.set_ylim([0.0, 1.0])

plt.plot(K, te_loss, tr_loss)
plt.legend(["testing", "training"])
plt.show()


# Accuracy
plt.title("K-value vs Accuracy (Eric Cho's Data)")
plt.xlabel("K-value")
plt.ylabel("Accuracy")

# might remove this
ax = plt.gca()
ax.set_ylim([0.0, 1.0])

plt.plot(K, te_acc, tr_acc)
plt.legend(["testing", "training"])
plt.show()


# sk with 4
X_train, X_test, Y_train, Y_test = train_test_split(skX, skY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)
knn = KNeighborsClassifier(n_neighbors=3, algorithm="brute")
knn = knn.fit(X_train, Y_train)

print("Training score (Sharath): " + str(knn.score(X_train, Y_train)))
print("Testing score (Sharath): " + str(knn.score(X_test, Y_test)))
print()

#EC with 4
X_train, X_test, Y_train, Y_test = train_test_split(ecX, ecY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)
knn = KNeighborsClassifier(n_neighbors=3, algorithm="brute")
knn = knn.fit(X_train, Y_train)

print("Training score (Eric): " + str(knn.score(X_train, Y_train)))
print("Testing score (Eric): " + str(knn.score(X_test, Y_test)))

Training score (Sharath): 0.9166666666666666
Testing score (Sharath): 0.9166666666666666

Training score (Eric): 0.9565217391304348
Testing score (Eric): 1.0


from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


skX = (sksample.drop(labels=["title", "description", "tags", "rating", "year","Like_Dislike"], axis=1)).to_numpy()
sky = (sksample["Like_Dislike"]).to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(skX, skY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)


# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Training Accuracy:", clf.score(X_train, Y_train))
print("Testing Accuracy:", clf.score(X_test, Y_test))

Training Accuracy: 1.0
Testing Accuracy: 1.0


from sklearn.tree import *
#import pydotplus
#from IPython.display import Image
#from six import StringIO

#dot_data = StringIO()
feature_cols = sksample.drop(labels=["title", "description", "tags", "rating", "year","Like_Dislike"], axis=1).columns.tolist()

plt.figure(figsize=(15,7))


plot_tree(clf, feature_names=feature_cols, class_names=["dislike", "like"])
#plot_tree(clf, feature_names=feature_cols)

[Text(0.5769230769230769, 0.9, 'AdaptedtoAnime <= 0.5\ngini = 0.478\nsamples = 48\nvalue = [29, 19]\nclass = dislike'),
 Text(0.3076923076923077, 0.7, 'BasedonaLightNovel <= 0.5\ngini = 0.219\nsamples = 32\nvalue = [28, 4]\nclass = dislike'),
 Text(0.15384615384615385, 0.5, 'MatureThemes <= 0.5\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = dislike'),
 Text(0.07692307692307693, 0.3, 'gini = 0.0\nsamples = 27\nvalue = [27, 0]\nclass = dislike'),
 Text(0.23076923076923078, 0.3, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = like'),
 Text(0.46153846153846156, 0.5, 'Action <= 0.5\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = like'),
 Text(0.38461538461538464, 0.3, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = like'),
 Text(0.5384615384615384, 0.3, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike'),
 Text(0.8461538461538461, 0.7, 'Action <= 0.5\ngini = 0.117\nsamples = 16\nvalue = [1, 15]\nclass = like'),
 Text(0.7692307692307693, 0.5, 'PersoninaStrangeWorld <= 0.5\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = like'),
 Text(0.6923076923076923, 0.3, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = like'),
 Text(0.8461538461538461, 0.3, 'Demons <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = dislike'),
 Text(0.7692307692307693, 0.1, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike'),
 Text(0.9230769230769231, 0.1, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = like'),
 Text(0.9230769230769231, 0.5, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = like')]


# Set up with Erics Examples
ecX = (manhwa_data.drop(labels=["title", "description", "tags", "rating", "year", "Like_Dislike"], axis=1)).to_numpy()
ecY = (manhwa_data["Like_Dislike"]).to_numpy()

# make training/testing set
X_train, X_test, Y_train, Y_test = train_test_split(ecX, ecY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Training Accuracy:", clf.score(X_train, Y_train))
print("Testing Accuracy:", clf.score(X_test, Y_test))

Training Accuracy: 1.0
Testing Accuracy: 1.0


feature_cols = manhwa_data.drop(labels=["title", "description", "tags", "rating", "year","Like_Dislike"], axis=1).columns.tolist()

plt.figure(figsize=(15,7))


plot_tree(clf, feature_names=feature_cols, class_names=["dislike", "like"])
#plot_tree(clf, feature_names=feature_cols)

[Text(0.4090909090909091, 0.9375, 'Manhwa <= 0.5\ngini = 0.476\nsamples = 46\nvalue = [28, 18]\nclass = dislike'),
 Text(0.3181818181818182, 0.8125, 'gini = 0.0\nsamples = 24\nvalue = [24, 0]\nclass = dislike'),
 Text(0.5, 0.8125, 'Shounen-ai <= 0.5\ngini = 0.298\nsamples = 22\nvalue = [4, 18]\nclass = like'),
 Text(0.4090909090909091, 0.6875, 'FullColor <= 0.5\ngini = 0.245\nsamples = 21\nvalue = [3, 18]\nclass = like'),
 Text(0.18181818181818182, 0.5625, 'Drama <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = dislike'),
 Text(0.09090909090909091, 0.4375, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = like'),
 Text(0.2727272727272727, 0.4375, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike'),
 Text(0.6363636363636364, 0.5625, 'SciFi <= 0.5\ngini = 0.188\nsamples = 19\nvalue = [2, 17]\nclass = like'),
 Text(0.45454545454545453, 0.4375, 'ExplicitSex <= 0.5\ngini = 0.111\nsamples = 17\nvalue = [1, 16]\nclass = like'),
 Text(0.36363636363636365, 0.3125, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = like'),
 Text(0.5454545454545454, 0.3125, 'Yaoi <= 0.5\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = like'),
 Text(0.45454545454545453, 0.1875, 'MatureRomance <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = dislike'),
 Text(0.36363636363636365, 0.0625, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike'),
 Text(0.5454545454545454, 0.0625, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = like'),
 Text(0.6363636363636364, 0.1875, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = like'),
 Text(0.8181818181818182, 0.4375, 'Adventure <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = dislike'),
 Text(0.7272727272727273, 0.3125, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike'),
 Text(0.9090909090909091, 0.3125, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = like'),
 Text(0.5909090909090909, 0.6875, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = dislike')]


from sklearn.neural_network import MLPClassifier

# set up Sharath's samples
skX = (sksample.drop(labels=["title", "description", "tags", "rating", "year", "Like_Dislike"], axis=1)).to_numpy()
skY = (sksample["Like_Dislike"]).to_numpy()

# make training/testing set
X_train, X_test, Y_train, Y_test = train_test_split(skX, skY, test_size=0.2, train_size=0.8, random_state=614, shuffle=True)

# initialize and train neural network
# activation function is ReLU-standard
# loss function is sgd
# learning rate has been set to 0.2 for training
nn = MLPClassifier(hidden_layer_sizes=(15,15,3), activation="relu", solver="sgd", shuffle=True, random_state=614, max_iter=600, verbose=True, learning_rate_init=0.2)
nn = nn.fit(X_train, Y_train)

# see scroes (mean error)
print("\nScores (Mean Accuracy)")
print("test score: " + str(nn.score(X_test, Y_test)))
print("training score: " + str(nn.score(X_train, Y_train)))

Iteration 1, loss = 0.66405658
Iteration 2, loss = 0.62816355
Iteration 3, loss = 0.58852031
Iteration 4, loss = 0.51603694
Iteration 5, loss = 0.43474858
Iteration 6, loss = 0.35646902
Iteration 7, loss = 0.29229925
Iteration 8, loss = 0.23382722
Iteration 9, loss = 0.18127073
Iteration 10, loss = 0.12635842
Iteration 11, loss = 0.08155672
Iteration 12, loss = 0.05028862
Iteration 13, loss = 0.03075169
Iteration 14, loss = 0.01758722
Iteration 15, loss = 0.01025202
Iteration 16, loss = 0.00635454
Iteration 17, loss = 0.00429061
Iteration 18, loss = 0.00298275
Iteration 19, loss = 0.00210838
Iteration 20, loss = 0.00150633
Iteration 21, loss = 0.00109183
Iteration 22, loss = 0.00080513
Iteration 23, loss = 0.00060697
Iteration 24, loss = 0.00046955
Iteration 25, loss = 0.00037554
Iteration 26, loss = 0.00030796
Iteration 27, loss = 0.00025825
Iteration 28, loss = 0.00022203
Iteration 29, loss = 0.00019517
Iteration 30, loss = 0.00017497
Iteration 31, loss = 0.00015953
Iteration 32, loss = 0.00014749
Iteration 33, loss = 0.00013794
Iteration 34, loss = 0.00013033
Iteration 35, loss = 0.00012419
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.

Scores (Mean Accuracy)
test score: 0.9166666666666666
training score: 1.0


# what if we doubled it!
nn = MLPClassifier(hidden_layer_sizes=(20,20,6), activation="relu", solver="sgd", shuffle=True, random_state=614, max_iter=600, verbose=True, learning_rate_init=0.2)
nn = nn.fit(X_train, Y_train)

# see scroes (mean error)
print("\nScores (Mean Accuracy)")
print("test score: " + str(nn.score(X_test, Y_test)))
print("training score: " + str(nn.score(X_train, Y_train)))

Iteration 1, loss = 0.69842120
Iteration 2, loss = 0.67927610
Iteration 3, loss = 0.65912810
Iteration 4, loss = 0.63718486
Iteration 5, loss = 0.61311979
Iteration 6, loss = 0.58339721
Iteration 7, loss = 0.54399954
Iteration 8, loss = 0.49506397
Iteration 9, loss = 0.44645408
Iteration 10, loss = 0.40350928
Iteration 11, loss = 0.35974043
Iteration 12, loss = 0.31400745
Iteration 13, loss = 0.26929240
Iteration 14, loss = 0.23055452
Iteration 15, loss = 0.19526259
Iteration 16, loss = 0.16177237
Iteration 17, loss = 0.13005466
Iteration 18, loss = 0.10026079
Iteration 19, loss = 0.07391577
Iteration 20, loss = 0.05362966
Iteration 21, loss = 0.03919814
Iteration 22, loss = 0.02823829
Iteration 23, loss = 0.02007347
Iteration 24, loss = 0.01460553
Iteration 25, loss = 0.01083800
Iteration 26, loss = 0.00808354
Iteration 27, loss = 0.00607342
Iteration 28, loss = 0.00460708
Iteration 29, loss = 0.00353540
Iteration 30, loss = 0.00275041
Iteration 31, loss = 0.00217171
Iteration 32, loss = 0.00174375
Iteration 33, loss = 0.00142549
Iteration 34, loss = 0.00118880
Iteration 35, loss = 0.00100719
Iteration 36, loss = 0.00086580
Iteration 37, loss = 0.00075457
Iteration 38, loss = 0.00066595
Iteration 39, loss = 0.00059460
Iteration 40, loss = 0.00053661
Iteration 41, loss = 0.00048903
Iteration 42, loss = 0.00044958
Iteration 43, loss = 0.00041635
Iteration 44, loss = 0.00038842
Iteration 45, loss = 0.00036480
Iteration 46, loss = 0.00034438
Iteration 47, loss = 0.00032683
Iteration 48, loss = 0.00031169
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.

Scores (Mean Accuracy)
test score: 1.0
training score: 1.0


from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix for Sharath's data
pred = nn.predict(X_test)
cm = confusion_matrix(Y_test, pred)

cm_display = ConfusionMatrixDisplay(cm).plot(cmap="pink")


ecX = (manhwa_data.drop(labels=["title", "description", "tags", "rating", "year", "Like_Dislike"], axis=1)).to_numpy()
ecY = (manhwa_data["Like_Dislike"]).to_numpy()

# make training/testing set
X_train, X_test, Y_train, Y_test = train_test_split(ecX, ecY, test_size=0.2, train_size=0.8, random_state=615, shuffle=True)

nn = MLPClassifier(hidden_layer_sizes=(20,20,6), activation="relu", solver="sgd", shuffle=True, random_state=614, max_iter=600, verbose=True, learning_rate_init=0.2)
nn = nn.fit(X_train, Y_train)

# see scroes (mean error)
print("\nScores (Mean Accuracy)")
print("test score: " + str(nn.score(X_test, Y_test)))
print("training score: " + str(nn.score(X_train, Y_train)))

Iteration 1, loss = 0.68933202
Iteration 2, loss = 0.66470290
Iteration 3, loss = 0.63535866
Iteration 4, loss = 0.60549905
Iteration 5, loss = 0.57569762
Iteration 6, loss = 0.53604651
Iteration 7, loss = 0.47560466
Iteration 8, loss = 0.39678589
Iteration 9, loss = 0.32356104
Iteration 10, loss = 0.26112361
Iteration 11, loss = 0.20836340
Iteration 12, loss = 0.17205988
Iteration 13, loss = 0.14783393
Iteration 14, loss = 0.13021786
Iteration 15, loss = 0.11168324
Iteration 16, loss = 0.09012525
Iteration 17, loss = 0.07342062
Iteration 18, loss = 0.06490571
Iteration 19, loss = 0.05699885
Iteration 20, loss = 0.04935083
Iteration 21, loss = 0.04064668
Iteration 22, loss = 0.02847394
Iteration 23, loss = 0.01560167
Iteration 24, loss = 0.00860033
Iteration 25, loss = 0.00520570
Iteration 26, loss = 0.00332491
Iteration 27, loss = 0.00230007
Iteration 28, loss = 0.00174464
Iteration 29, loss = 0.00139190
Iteration 30, loss = 0.00111719
Iteration 31, loss = 0.00087734
Iteration 32, loss = 0.00067983
Iteration 33, loss = 0.00052999
Iteration 34, loss = 0.00042258
Iteration 35, loss = 0.00035213
Iteration 36, loss = 0.00030207
Iteration 37, loss = 0.00026559
Iteration 38, loss = 0.00023819
Iteration 39, loss = 0.00021713
Iteration 40, loss = 0.00020050
Iteration 41, loss = 0.00018709
Iteration 42, loss = 0.00017611
Iteration 43, loss = 0.00016701
Iteration 44, loss = 0.00015942
Iteration 45, loss = 0.00015305
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.

Scores (Mean Accuracy)
test score: 1.0
training score: 1.0


# confusion matrix for Erics data
pred = nn.predict(X_test)
cm = confusion_matrix(Y_test, pred)

cm_display = ConfusionMatrixDisplay(cm).plot(cmap="pink")

	title	description	rating	year	tags	cover
0	Salad Days (Tang LiuZang) - Part 2	The second season of Salad Days (Tang LiuZang).	4.7	2021.0	['BL', 'Manhua', 'Romance', 'Shounen-ai', 'Spo...	https://cdn.anime-planet.com/manga/primary/sal...
1	The Master of Diabolism	As the grandmaster who founded the Demonic Sec...	4.7	2017.0	['Action', 'Adventure', 'BL', 'Comedy', 'Manhu...	https://cdn.anime-planet.com/manga/primary/the...
2	JoJo's Bizarre Adventure Part 7: Steel Ball Run	Set in 1890, Steel Ball Run spotlights Gyro Ze...	4.7	2004.0	['Action', 'Adventure', 'Horror', 'Mystery', '...	https://cdn.anime-planet.com/manga/primary/joj...
3	A Sign of Affection	Yuki is a typical college student, whose world...	4.7	2019.0	['Romance', 'Shoujo', 'Slice of Life', 'Disabi...	https://cdn.anime-planet.com/manga/primary/a-s...
4	Moriarty the Patriot	Before he was Sherlock’s rival, Moriarty fough...	4.7	2016.0	['Mystery', 'Shounen', 'Detectives', 'England'...	https://cdn.anime-planet.com/manga/primary/mor...

	title	description	rating	year	tags
60501	Copy Cat	This entry currently doesn't have a synopsis. ...	NaN	2000.0	['BL', 'Drama', 'Slice of Life', 'Yaoi']
30068	Sonna no Koi ja Nai	Chiaki found out that her boyfriend had anothe...	NaN	2000.0	['Drama', 'Romance', 'Shoujo', 'Collections']
57167	Maihime Terpsichore	Late into the school year, Sudo Kumi transfers...	NaN	2000.0	['Drama', 'Josei', 'Ballet Dancing', 'Dancing']
59215	Almost Paradise (Debbie MACOMBER)	This entry currently doesn't have a synopsis. ...	NaN	2000.0	['Josei', 'Romance', 'Harlequin', 'Based on a ...
46087	Easy Writer	Monica has finally gotten her dream job as a r...	NaN	2000.0	['Comedy', 'Drama', 'Josei', 'Romance', 'Slice...

Recommending Your Favorite Manwhas, Mangas, Manhuas with a Machine Learning Approach¶

Introduction¶

Recommender System¶

Content-based filtering¶

Collaborative filtering¶

Hybrid-based filtering¶

Dataset¶

Feature Engineering and Data Analysis¶

Using K-NN as the backbone of our Reccomendation System¶

Evaluating Recommendations¶

Finding the Best Model¶

K-Nearest Neighbors¶

Decision Trees¶

Neural Network¶

Conclusion¶

Works Cited¶

	Romance	Comedy	Drama	BL	Action	Yaoi	...	AdaptedtoAnime	Harlequin
0	0	0	1	1	0	1	...	0	0
1	1	0	1	0	0	0	...	0	0
2	0	0	1	0	0	0	...	0	0
3	1	0	0	0	0	0	...	0	1
4	1	1	1	0	0	0	...	0	0
5	1	1	0	0	0	0	...	0	0
6	0	0	1	0	0	0	...	1	0
7	1	0	0	1	0	1	...	0	0
8	1	1	1	0	1	0	...	0	0
9	0	0	0	0	0	0	...	0	0

	title	description	rating	year	tags	Comedy	Drama	Fantasy	...	AdaptedtoAnime	Magic	Isekai	Demons	Like_Dislike
43740	The Eminence in Shadow	Shadowbrokers are those who go unnoticed, posi...	4.6	2018.0	[Action, Comedy, Fantasy, Shounen, Isekai, Mag...	1	0	1	...	1	1	1	0	1.0
57732	Oshi no Ko	In the entertainment industry, lying is both y...	4.6	2020.0	[Drama, Seinen, SliceofLife, Acting, Idols, Ps...	0	1	0	...	0	0	0	0	1.0
30544	Tsukimichi: Moonlit Fantasy	Misumi Makoto was just a normal high-school st...	4.5	2015.0	[Action, Adventure, Comedy, Fantasy, Shounen, ...	1	0	1	...	1	1	1	0	1.0
43592	Chainsaw Man	Denji's life of poverty is changed forever whe...	4.6	2018.0	[Action, Fantasy, Horror, Shounen, DarkFantasy...	0	0	1	...	1	0	0	1	1.0
28152	Overlord	What do you do when your favorite game shuts d...	4.6	2014.0	[Action, Adventure, Fantasy, DarkFantasy, Demo...	0	0	1	...	1	1	1	1	1.0

	title	description	rating	year	tags	Romance	Comedy	Drama	Fantasy	...	AdaptedtoAnime	Like_Dislike
57872	Nano Machine	After being held in disdain and having his lif...	4.7	2020.0	[Action, Adventure, Fantasy, Manhwa, SciFi, We...	0	0	0	1	...	0	1.0
51958	Tower of God - Part 3	The third season of Tower of God.	4.7	2019.0	[Action, Adventure, Drama, Fantasy, Manhwa, We...	0	0	1	1	...	0	1.0
42249	A Good Day to be a Dog	Hana is cursed into a dog from her first kiss,...	4.6	2017.0	[Comedy, Drama, Manhwa, Romance, Webtoons, Adu...	1	1	1	0	...	0	1.0
16076	Tower of God	Fame. Glory. Power. Anything in your wildest d...	4.6	2010.0	[Action, Adventure, Drama, Fantasy, Manhwa, We...	0	0	1	1	...	1	1.0
43833	A Business Proposal	Ha-ri made a deal—go on one blind date for her...	4.6	2018.0	[Comedy, Drama, Manhwa, Romance, Webtoons, Adu...	1	1	1	0	...	0	1.0

	Romance	Comedy	Drama	BL	Action	Yaoi	...	AdaptedtoAnime	Harlequin
0	0	0	1	1	0	1	...	0	0
1	1	0	1	0	0	0	...	0	0
2	0	0	1	0	0	0	...	0	0
3	1	0	0	0	0	0	...	0	1
4	1	1	1	0	0	0	...	0	0
5	1	1	0	0	0	0	...	0	0
6	0	0	1	0	0	0	...	1	0
7	1	0	0	1	0	1	...	0	0
8	1	1	1	0	1	0	...	0	0
9	0	0	0	0	0	0	...	0	0

	Romance	Comedy	Drama	BL	Action	Yaoi	...	AdaptedtoAnime	Harlequin
0	0	0	1	1	0	1	...	0	0
1	1	0	1	0	0	0	...	0	0
2	0	0	1	0	0	0	...	0	0
3	1	0	0	0	0	0	...	0	1
4	1	1	1	0	0	0	...	0	0
5	1	1	0	0	0	0	...	0	0
6	0	0	1	0	0	0	...	1	0
7	1	0	0	1	0	1	...	0	0
8	1	1	1	0	1	0	...	0	0
9	0	0	0	0	0	0	...	0	0