From 264c5e572efe7ee9cfb816d91e2419aec762c015 Mon Sep 17 00:00:00 2001
From: Matt Bayer
Date: Sat, 4 May 2019 14:13:53 -0400
Subject: [PATCH 1/3] Changed number of candidates/clusters to prevent
 oversampling

---
 extractive/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extractive/config.py b/extractive/config.py
index 15795ef..512b29c 100644
--- a/extractive/config.py
+++ b/extractive/config.py
@@ -2,5 +2,5 @@
     "minimum_samples": 2,
     "min_clusters": 50,
     "min_chars_per_sentence": 42,
-    "min_num_candidates": 500,
+    "min_num_candidates": 175,
     "max_acceptable_clusters": 100
 }

From ce446664f69d5a4997063e2b5035921e5adb513b Mon Sep 17 00:00:00 2001
From: Matt Bayer
Date: Sat, 4 May 2019 15:05:57 -0400
Subject: [PATCH 2/3] New clustering hyperparameters

---
 config.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/config.py b/config.py
index 60d12c5..0cf4331 100644
--- a/config.py
+++ b/config.py
@@ -14,11 +14,11 @@
     'autoencoder':None,
     'ae_batchsize': 5000,
 
-    'density_parameter': .04,
-    'minimum_samples': 4,
-    'min_clusters': 5,
-    'max_acceptable_clusters':30,
-    'min_num_candidates': 100,
+    'density_parameter': 2,
+    'minimum_samples': 2,
+    'min_clusters': 50,
+    'max_acceptable_clusters':100,
+    'min_num_candidates': 200,
 
     'BERT_finetune_path':'bert_finetune/models/finetune_electronics_mae1.pt',
     'BERT_config_path': 'bert_finetune/models/finetune_electronics_mae1config.json',
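
A note on patch 2: density_parameter and minimum_samples are the knobs of the DBSCAN clusterer imported in extractive/helpers.py. Below is a minimal sketch of how they presumably reach scikit-learn's DBSCAN inside find_clusters; the function body is not shown in these patches, so the name find_clusters_sketch, the mapping of density_parameter to eps, and minimum_samples to min_samples are inferred from the imports and call sites, not code from the repository:

from sklearn.cluster import DBSCAN

def find_clusters_sketch(encodings, config):
    # Assumed mapping: density_parameter -> eps (neighborhood radius),
    # minimum_samples -> min_samples (core-point threshold).
    db = DBSCAN(eps=config['density_parameter'],
                min_samples=config['minimum_samples']).fit(encodings)
    sentence_labels = db.labels_  # DBSCAN labels noise points as -1
    num_clusters = len(set(sentence_labels)) - (1 if -1 in sentence_labels else 0)
    return sentence_labels, num_clusters

The jump in density_parameter from .04 to 2 is consistent with moving from cosine distances, which are bounded by 2, to unbounded euclidean distances between unnormalized embeddings, which is exactly the swap patch 3 makes in helpers.py.
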
print("clustering...") sentence_labels, num_clusters = extractive_cluster(features) sentences = extractive_decode(text, sentence_labels, features, num_clusters, config) + print("Number of clusters: " + str(num_clusters)) print("Number of candidates: " + str(len(sentences))) - print(sentences[::len(sentences)//num_clusters]) + print(sentences[::len(sentences)//num_clusters]) def cluster(encodings, sentences, config): - if False: + if True: sentence_labels, num_clusters = find_clusters(encodings, config) candidate_sentences = sample(sentences, sentence_labels, encodings, num_clusters, config) return candidate_sentences else: - sentence_labels, _ = find_clusters(encodings, config) + sentence_labels, num_clusters = find_clusters(encodings, config) + print("Number of clusters: " + str(num_clusters)) means = [] for cluster in set(sentence_labels): + print("CLUSTER " + str(cluster) + "\n") if cluster == -1: - continue + pass + cluster_indices = np.where(sentence_labels == cluster) + for i in cluster_indices[0][:10]: + print(sentences[i]) + print("\n") cluster_core_samples = encodings[cluster_indices] average = np.mean(cluster_core_samples, axis = 0) means.append(average) + print(len(means)) return means - print(np.shape(cluster(features, [], config))) - print(type(cluster(features, [], config))) - print(type(cluster(features, [], config)[0])) \ No newline at end of file + #print(cluster(features, text, config)) + cluster(features, text, config) + #print(np.shape(cluster(features, text, config))) + #print(type(cluster(features, [], config))) + #print(type(cluster(features, [], config)[0])) + +""" +B000QUUFRW +B000JE7GPY +B000WL6YY8 +B003ES5ZUU +B002YU83YO +B008NMCPTQ +B003LSTD38 +B000WYVBR0 +B001GTT0VO +B0043WJRRS +B00902SFC4 +B00GTGETFG +B00007EDZG +B002TLTGM6 +B0088CJT4U +""" \ No newline at end of file diff --git a/extractive/helpers.py b/extractive/helpers.py index 72027f1..818bd2c 100644 --- a/extractive/helpers.py +++ b/extractive/helpers.py @@ -1,7 +1,7 @@ import numpy as np from math import ceil from sklearn.cluster import DBSCAN -from sklearn.metrics.pairwise import cosine_distances +from sklearn.metrics.pairwise import cosine_distances, euclidean_distances def cut_out_shorts(list_of_sentences, features, config): @@ -69,11 +69,12 @@ def sample(list_of_sentences, sentence_labels, features, num_clusters, config): cluster_indices = np.where(sentence_labels == cluster) cluster_core_samples = features[cluster_indices] average = np.mean(cluster_core_samples, axis = 0) - distances_from_cluster = cosine_distances(features, average.reshape(1,-1)) + distances_from_cluster = euclidean_distances(features, average.reshape(1,-1)) sample_sentence_indices = np.argsort(distances_from_cluster.flatten())[:samples_per_cluster] for sentence_index in sample_sentence_indices: candidates.append(list_of_sentences[sentence_index]) - + print(list_of_sentences[sentence_index]) + print("#######") return candidates