import pandas as pd 
import numpy as np 
from pandas import ExcelWriter

from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random number generator so that repeated runs of this
# script draw the same sequence of random numbers.
np.random.seed(2018)

# Classifiers to evaluate, keyed by display name. The key is printed during the
# run and used as the Excel sheet name for that classifier's results.
# Commented-out entries are disabled; uncomment an entry to include it.
clfs = {
    # "LogisticRegression": LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=200, multi_class='multinomial', ),
    # "DecisionTreeClassifier": DecisionTreeClassifier(class_weight='balanced'),
    # "MLPClassifier": MLPClassifier(solver='lbfgs', early_stopping=True),
    "SVC": SVC(kernel='poly', class_weight='balanced'),
    # "KNeighborsClassifier": KNeighborsClassifier(),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    # "GaussianNB": GaussianNB()
}
# Sample categorizations to evaluate. Each name selects the input file
# '.../categorization_miRNA(RPMlog2)/<name>/data.csv' and the gene list
# '<name>.txt', and labels one row of the results table. In the loop below the
# integer keys are only used to look the name up.
categories = {
    # 1: "common",
    2: "age",
    3: "tumor",
    4: "status",
    5: "stage",
    6: "histological"
}
# Sources of the gene lists (presumably feature-selection methods -- confirm).
# Each name selects the folder '../data/list/<name>/' and labels one column of
# the results table. Commented-out entries are disabled.
algos = {
    1: "efs",
    # 2: "disr",
    # 3: "cife",
    # 4: "icap",
    # 5: "adaptive",
    # 6: "average"
}

# Evaluate every enabled classifier on every (category, algorithm) gene subset
# and record the mean 5-fold cross-validation accuracy. One Excel sheet is
# written per classifier: rows are categories, columns are algorithms.
#
# ExcelWriter is used as a context manager so the workbook is saved and closed
# when the block exits; this replaces ExcelWriter.save(), which no longer
# exists in pandas >= 2.0.
with ExcelWriter("../Results/classify/classify_efs.xlsx") as writer:
    for clf_name, estimator in clfs.items():
        res_df = pd.DataFrame(columns=list(algos.values()), index=list(categories.values()))

        for category_name in categories.values():
            main_df = pd.read_csv(
                '../data/R3/Level- 3/categorization_miRNA(RPMlog2)/{}/data.csv'.format(category_name),
                header=0,
            )

            for algo_name in algos.values():
                # One gene name per line; close the file as soon as it is read.
                gene_list_path = '../data/list/{}/{}.txt'.format(algo_name, category_name)
                with open(gene_list_path) as gene_file:
                    genes = [line.rstrip('\n') for line in gene_file]
                genes = genes + ['label']

                # Keep only the listed genes (plus the label) that are present
                # in the data file.
                curr_df = main_df[main_df.columns.intersection(genes)]
                print(curr_df.head())

                # NOTE(review): the slice skips the first selected column and
                # treats the last one as the target. This assumes 'label' is
                # the last column and that column 0 is not a feature -- confirm
                # against the data files.
                X = curr_df.values[:, 1:-1]
                y = curr_df.values[:, -1]

                # The Tomek-links undersampling step was already disabled in
                # this script; its unused TomekLinks(...) construction was
                # dropped because the 'return_indices' and 'ratio' keywords are
                # rejected by current imbalanced-learn releases.
                print(clf_name)
                scores = cross_val_score(estimator, X, y, cv=5)
                # Leave the shared estimator fitted on the full data set, as
                # the original script did.
                estimator.fit(X, y)
                print("Accuracy: %0.2f\n" % (scores.mean()))
                # .loc replaces the .ix indexer, which was removed in pandas 1.0.
                res_df.loc[category_name, algo_name] = scores.mean()
                print('===================================\n\n')

        res_df.to_excel(writer, sheet_name=clf_name)