Ethan C.

asked • 07/08/21

K Means Clustering: Cluster Centers/Centroid

I'm trying to find the cluster centroids (in bold) for the code below. I would attach the annotations.csv file, but it is too large.

Input:

#Set Up Cell

#regex - text processing

import re

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

nltk.download('punkt')

from nltk.stem import PorterStemmer

import string

from sklearn.manifold import TSNE

#Bag of Words

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#read file

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

#mount my drive

from google.colab import drive

drive.mount("/content/drive")

#Import Data

# filepaths

# filepath = '/content/drive/MyDrive/annotations.csv'

# Location of the annotations CSV on the Google Drive mounted at /content/drive.
filepath = '/content/drive/MyDrive/ColabNotebooks/annotations.csv'

# dataframe

# Load the CSV into a DataFrame; the preprocessing step below reads and
# overwrites its 'Text' column.
df = pd.read_csv(filepath)

# Display the DataFrame.
# NOTE(review): a bare `df` presumably relies on notebook display (it only
# renders when it is the last expression of a cell) and shows the whole frame
# in truncated form; `df.head()` would show just the first few rows -- confirm
# which was intended.

df

#preprocessing tweets

# English stopword list from NLTK, stored as a set for fast membership tests
# inside preprocess_tweet_text.
stop_words = set(stopwords.words('english'))

def preprocess_tweet_text(tweet):
  """Clean a single tweet and return its stemmed tokens as one string.

  Steps, in order: lowercase the text, strip URLs, @mentions and #hashtags,
  remove ASCII punctuation, tokenize with NLTK, drop English stopwords, and
  stem every remaining token with the Porter stemmer.

  Relies on the module-level ``stop_words`` set and on the NLTK 'punkt'
  tokenizer data having been downloaded.

  Args:
    tweet: The raw tweet text (a ``str``).

  Returns:
    The surviving stemmed tokens joined by single spaces; an empty string
    when nothing survives the filtering.
  """

  #lowercase each letter
  # str.lower() returns a new string, so the result has to be re-bound.
  # Without this, capitalised stopwords ("The", "And") slip past the
  # lowercase stop_words filter and upper-case URLs are not matched.
  tweet = tweet.lower()

  #remove urls, hashtags, mentions
  # Raw strings keep the regex escapes out of Python's string-escape handling.
  # Inside the character class, `$-_` is a range (ASCII 0x24-0x5F); it is what
  # lets the pattern consume '/', ':', '=' and '?' in the rest of the URL.
  tweet = re.sub(r'http[s]?://(?:[a-zA-Z0-9$-_@.&\+!*\(\),%])+', '', tweet, flags=re.MULTILINE)
  tweet = re.sub(r'@[a-zA-Z0-9_]+', '', tweet)
  tweet = re.sub(r"#([a-zA-Z0-9_]{1,50})", '', tweet)

  #remove punctuation
  tweet = tweet.translate(str.maketrans('', '', string.punctuation))

  #remove stopwords
  tweet_tokens = word_tokenize(tweet)
  filtered_text = [w for w in tweet_tokens if w not in stop_words]

  #stem words
  ps = PorterStemmer()
  stem_text = [ps.stem(w) for w in filtered_text]

  return " ".join(stem_text)

# Replace every tweet in the 'Text' column with its cleaned, stemmed form.
df['Text'] = df['Text'].apply(preprocess_tweet_text)

print(df)

#import

from sklearn.cluster import KMeans

import numpy as np

# NOTE(review): `vector1` is not defined anywhere in the visible code.
# Presumably it is the feature matrix built from df['Text'] by one of the
# vectorizers imported above (TfidfVectorizer / CountVectorizer) -- confirm
# that the vectorization cell runs before this one.
data = vector1

print(data)

# Fit k-means with 4 clusters; random_state=0 fixes the seed used for the
# k-means++ initialisation.
kmeans1 = KMeans(n_clusters=4, random_state=0, max_iter=250, init='k-means++').fit(data) #61 seconds

# cluster_centers_ holds the fitted centroids, one row per cluster.
print(kmeans1.cluster_centers_)

Output:

[[0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 1. 0.] [0. 0. 0. 1. 0. 0.] ... [0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 1. 0.] [0. 0. 1. 0. 0. 0.]]

[[ 3.60822483e-16 3.60822483e-16 1.00000000e+00 1.16573418e-15 2.38697950e-15 3.60822483e-16] [-4.02455846e-16 -4.02455846e-16 -2.47024623e-15 -1.16573418e-15 1.00000000e+00 -4.02455846e-16] [-9.99200722e-16 -9.99200722e-16 -1.24900090e-15 1.00000000e+00 9.99200722e-16 -9.99200722e-16] [ 5.77350269e-01 5.77350269e-01 1.19348975e-15 1.11022302e-15 2.10942375e-15 5.77350269e-01]]


Al Y.

tutor
Hi, have a look here, but feel free to drop me a line: stackoverflow.com/questions/47291025/extracting-centroids-using-k-means-clustering-in-python
Report

07/08/21

1 Expert Answer

By:

Anxhelo D. answered • 08/30/21

Tutor
New to Wyzant

Passionate Computer Scientist

Still looking for help? Get the right answer, fast.

Ask a question for free

Get a free answer to a quick problem.
Most questions answered within 4 hours.

OR

Find an Online Tutor Now

Choose an expert and meet online. No packages or subscriptions, pay only for the time you need.