import pandas as pd 
import numpy as np 
from pandas import ExcelWriter
import xlrd

from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Fix NumPy's global random state so repeated runs start from the same seed.
np.random.seed(2018)

# Registry of the classifiers to evaluate, keyed by estimator class name.
clfs = {
    type(estimator).__name__: estimator
    for estimator in (
        LogisticRegression(
            class_weight='balanced',
            solver='lbfgs',
            max_iter=200,
            multi_class='multinomial',
        ),
        DecisionTreeClassifier(class_weight='balanced'),
        MLPClassifier(solver='lbfgs', early_stopping=True),
        LinearSVC(class_weight='balanced'),
        KNeighborsClassifier(),
        RandomForestClassifier(class_weight='balanced'),
        GaussianNB(),
    )
}
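# NOTE: legacy hard-coded category map; it is commented out and not used.
# The evaluation loop takes its categories from the sheet names of each
# comparison workbook instead.
# categories = {
#     1: "common",
#     2: "age",
#     3: "tumor",
#     4: "status",
#     5: "stage",
#     6: "histological"
# }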
# Feature-selection algorithms to evaluate, numbered from 1 in run order.
algos = dict(enumerate(("icap", "cife", "cmim", "disr"), start=1))
# Score every classifier on the features picked by each selection algorithm
# and write one result sheet per algorithm into a single workbook.
# Using the writer as a context manager closes the workbook when the block
# exits; it replaces the explicit ExcelWriter.save() call, which recent pandas
# releases no longer provide.
with ExcelWriter("../Results/classify/algos.xlsx") as writer:
    for algo_name in algos.values():
        print(algo_name)

        comparison_path = "../Results/comparison/{}.xlsx".format(algo_name.upper())
        # Open the comparison workbook once per algorithm and release the file
        # handle afterwards (previously the handle was never released and the
        # workbook was re-read from disk for every category).
        with pd.ExcelFile(comparison_path) as workbook:
            # Every sheet except the last one is treated as a category.
            categories = list(workbook.sheet_names)[:-1]
            # Rows are categories, columns are classifiers; each cell holds the
            # mean 5-fold cross-validation score.
            res_df = pd.DataFrame(columns=list(clfs), index=categories, dtype=float)

            for category in categories:
                # The data folder is the second '_'-separated token of the
                # sheet name.
                data_dir = category.split('_')[1]
                main_df = pd.read_csv(
                    '../data/R3/Level- 3/categorization_miRNA(RPMlog2)/{}/data.csv'.format(data_dir),
                    header=0,
                )
                # Read the sheet belonging to this category. The earlier code
                # omitted the sheet name, so every category silently used the
                # gene list from the first sheet of the workbook.
                gene_df = workbook.parse(category, header=0)
                genes = gene_df.iloc[:, 0].values.tolist() + ['label']
                curr_df = main_df[main_df.columns.intersection(genes)]
                # NOTE(review): this slicing assumes 'label' ends up as the
                # last selected column and deliberately skips the first
                # selected column -- confirm both against the CSV layout.
                X = curr_df.values[:, 1:-1]
                y = curr_df.values[:, -1]
                # NOTE(review): `ratio`, `return_indices` and `fit_sample` are
                # the legacy imbalanced-learn API; newer releases expect
                # `sampling_strategy` and `fit_resample`. Kept as-is to match
                # the library version this script was written for.
                tl = TomekLinks(return_indices=True, ratio='majority')
                X, y, _ = tl.fit_sample(X, y)
                print('\n\n{}'.format(category))
                for clf_name, clf in clfs.items():
                    # cross_val_score fits clones of the estimator itself, so
                    # the extra fit on the full data (whose result was never
                    # used) has been dropped.
                    scores = cross_val_score(clf, X, y, cv=5)
                    print(category, clf_name)
                    res_df.loc[category, clf_name] = scores.mean()
                    print(res_df.head())
                print('===================================\n\n')

        # Rank the categories once, after all of them are scored. Doing this
        # inside the category loop added the previous 'Sum Score' column into
        # every later total.
        res_df['Sum Score'] = res_df.sum(axis=1)
        res_df = res_df.sort_values('Sum Score', ascending=False)
        res_df.to_excel(writer, sheet_name=algo_name)