Commit b569bc0d authored by Sofia Sysourka

Update evaluation.py

parent f5c7bee8
from sklearn.externals import joblib
import os
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import auxiliary_functions
import json
@@ -12,10 +12,9 @@ from data_analysis import manual_preprocess
-def evaluate_design(url):
+def evaluate_design(urls):
-    # Load models created for prediction
+    # Load models created for prediction (models are located in the folder 'models' of the current workspace)
    clf1 = joblib.load('models\\layer_00_KNNclf3.pkl')
    clf2 = joblib.load('models\\layer_07_DTclf.pkl')
    clf3 = joblib.load('models\\layer_09_DTclf.pkl')
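The remaining models (clf4, clf5, used further down) are presumably loaded in the collapsed lines. One caveat for anyone rerunning this script on a current scikit-learn: sklearn.externals.joblib was removed in scikit-learn 0.23, so these pickles would be loaded with the standalone joblib package instead. A minimal sketch, assuming the same 'models' folder layout:

import os
import joblib  # standalone package, replaces sklearn.externals.joblib

# hypothetical modernized load of the same pickle
clf1 = joblib.load(os.path.join('models', 'layer_00_KNNclf3.pkl'))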
@@ -28,8 +27,8 @@ def evaluate_design(url):
    for i in [0, 7, 9, 10, 12]:
        layer = "layer_%02d" % i
        selected_features = dataset[layer]["selected features"]
-        X_eval, y_eval = auxiliary_functions.create_data_array([url], layer, 'eval_data\\', target_names)
+        X_eval, y_eval = auxiliary_functions.create_data_array(urls, layer, 'eval_data\\', target_names)
        # fill in missing values
        imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X_eval)
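preprocessing.Imputer with missing_values='NaN' is the pre-0.20 scikit-learn API: it fills each NaN with the mean of its feature column, computed here from the evaluation matrix itself. A rough equivalent on scikit-learn >= 0.20, assuming the same mean strategy, is SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

X_eval = np.array([[1.0, np.nan], [3.0, 4.0]])  # toy feature matrix
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_eval = imp.fit_transform(X_eval)  # the NaN becomes the column mean, 4.0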
@@ -45,6 +44,7 @@ def evaluate_design(url):
        X_eval = X_eval[:, selected_features]
        evaluation_data[layer] = {}
        evaluation_data[layer]['X'] = X_eval
+    # Evaluate models on given data
    y_pred1 = clf1.predict_proba(evaluation_data['layer_00']['X']) #probability of assignment to each class
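predict_proba returns one row per evaluated page and one column per class, so with three categories each y_predN has shape (len(urls), 3). A toy shape check with invented numbers:

import numpy as np

y_pred1 = np.array([[0.7, 0.2, 0.1],   # page 1: most mass on class 0
                    [0.1, 0.3, 0.6]])  # page 2: most mass on class 2
assert y_pred1.shape == (2, 3)         # (n_pages, n_classes)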
@@ -62,36 +62,55 @@ def evaluate_design(url):
    # Aggregate results
-    w = np.array([[1,1,1],[1,1,1],[1,1,1],[1,1,1],[1,1,1]]) #class weights for each classifier
-    print w
-    res1 = np.multiply(y_pred1, w[1,:])
-    res2 = np.multiply(y_pred2, w[2,:])
-    res3 = np.multiply(y_pred3, w[3,:])
+    #class weights for each classifier
+    w = np.array([[0.6,0.8,0.7],[0.9,0.6,0.9],[0.8,0.8,1],[1,1,1],[1,0.8,0.6]])
+    # multiply results with given weights
+    res1 = np.multiply(y_pred1, w[0,:])
+    res2 = np.multiply(y_pred2, w[1,:])
+    res3 = np.multiply(y_pred3, w[2,:])
    res4 = np.multiply(y_pred4, w[3,:])
-    res5 = np.multiply(y_pred5, w[3,:])
+    res5 = np.multiply(y_pred5, w[4,:])
-    result = np.sum([res1, res2, res3, res4, res5], axis=0) #sum results per class (i.e. per column)
-    y_pred = np.argmax(result)
+    # find mean value of results per class/category for each webpage
+    result = np.sum([res1, res2, res3, res4, res5], axis=0)
+    result = np.divide(result,5)
+    # find category with maximum probability
+    y_pred = np.argmax(result, axis=1)
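The new aggregation is a weighted soft vote: each classifier's probability matrix is scaled column-wise by its row of w, the five scaled matrices are summed, the sum is divided by 5, and argmax along axis 1 yields one category index per page (the old plain argmax flattened the matrix, which only worked for a single page). A worked sketch with two classifiers and made-up numbers:

import numpy as np

p1 = np.array([[0.6, 0.3, 0.1]])   # classifier 1: one page, three classes
p2 = np.array([[0.2, 0.5, 0.3]])   # classifier 2
w = np.array([[1.0, 0.5, 1.0],     # per-class weights for classifier 1
              [0.5, 1.0, 1.0]])    # per-class weights for classifier 2

result = np.sum([p1 * w[0, :], p2 * w[1, :]], axis=0) / 2
print(result)                      # [[0.35  0.325 0.2  ]]
print(np.argmax(result, axis=1))   # [0] -> first category wins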
    print y_pred
    print y_eval
+    conf_mat = confusion_matrix(y_eval, y_pred)
+    accuracy = []
+    for i, result in enumerate(conf_mat):
+        accuracy.append(float(result[i])/sum(result))
+    report = classification_report(y_eval, y_pred, target_names=target_names)
+    print (report)
+    print ("--ACCURACY--")
+    for i, target in enumerate(target_names):
+        print (target + " : " + str(accuracy[i]))
+    print ("Average accuracy : " + str(accuracy_score(y_eval, y_pred)))
-    if y_pred==y_eval:
-        print "SUCCESS!"
-    else:
-        print "FAILURE :("
# MAIN EVALUATION CODE
evaluation_urls = []
# TODO -> ADD MORE PAGES FOR EVALUATION
-news = ['http://www.usatoday.com/', 'http://www.independent.ie/',
-        'http://www1.cbn.com/home', 'http://news.gc.ca/web/index-en.do',
-        'https://my.yahoo.com/', 'https://www.alarabiya.net/']
+news = ['http://www.zougla.gr/', 'http://www.independent.ie/',
+        'http://www1.cbn.com/home', 'http://news.gc.ca/web/index-en.do']
-shopping = ['https://www.walgreens.com/?experience=B', 'http://www.barnesandnoble.com/',
-            'https://www.wayfair.com/', 'https://www.humblebundle.com/star-wars-3-bundle',
-            'http://eu.wiley.com/WileyCDA/Section/index.html', 'http://www.staples.com/office/supplies/home']
+shopping = []
-search_engines = []
+search_engines = ['http://isearch.babylon.com/', 'http://www.peekyou.com/',
+                  'http://www.kiddle.co/', 'https://www.deepdyve.com/', 'http://hotbot.com/',
+                  'http://pdfsb.net/']
print len(news)
print len(shopping)
@@ -104,13 +123,11 @@ evaluation_urls.extend(search_engines)
categories = ["news"]*len(news)
categories.extend(["e-shopping"]*len(shopping))
categories.extend(["search engine"]*len(search_engines))
-target_names = (np.unique(categories)).tolist() #names of the categories used in the analysis
+target_names = (np.unique(categories)).tolist() # sorted names of the categories used in the analysis
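The updated comment is worth taking literally: np.unique returns the labels sorted, so target_names here becomes ['e-shopping', 'news', 'search engine'], and the integer indices produced by argmax must be read against that sorted order. Quick check:

import numpy as np

categories = ['news', 'news', 'e-shopping', 'search engine']
print(np.unique(categories).tolist())
# ['e-shopping', 'news', 'search engine'] -> class 0 is 'e-shopping'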
metrics = ['density', 'equilibrium', 'balance', 'regularity', 'simplicity', 'homogeneity', 'alignment', 'proportion',
'cohesion', 'grouping', 'symmetry', 'rhythm']
# COLLECT DATA
eval_webpage_info = {}
@@ -126,20 +143,19 @@ json_results = json.dumps(eval_webpage_info, indent = 4)
with open('eval_data\\eval_webpage_info.json', 'w') as f:
    f.write(json_results)
# Load webpage info
with open('eval_data\\eval_webpage_info.json', 'r') as f:
    eval_webpage_info = json.load(f)
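The hard-coded 'eval_data\\' separators tie the script to Windows; os (already imported at the top) offers a portable spelling if this ever needs to run elsewhere. A sketch of the same dump/load round trip, assuming the same file name:

import json
import os

eval_webpage_info = {'max_layer': 12}  # toy stand-in for the real dict
path = os.path.join('eval_data', 'eval_webpage_info.json')
with open(path, 'w') as f:
    json.dump(eval_webpage_info, f, indent=4)
with open(path, 'r') as f:
    eval_webpage_info = json.load(f)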
-# Calculate metrics
+# CALCULATE METRICS
calc_metrics(eval_webpage_info, 'eval_data\\', metrics) #give file path to read data and write results
-# Perform initial preprocessing
+# PERFORM INITIAL PREPROCESSING
max_layer, metric_names = manual_preprocess(evaluation_urls, 'eval_data\\') #get evaluation data
eval_webpage_info["max_layer"] = max_layer
eval_webpage_info["metric_names_ordered"] = metric_names
-print metric_names # change to alphabetic order
+print metric_names
# Load system data to find selected features for each layer
@@ -147,8 +163,6 @@ with open('system_data\\dataset.json', 'r') as f:
    dataset = json.load(f)
-# Evaluate each webpage
-for url in evaluation_urls:
-    evaluate_design(url)
+# PREDICT WEBPAGE CATEGORY
+evaluate_design(evaluation_urls)
\ No newline at end of file