import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import stats
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
# --- Load and clean the raw data ---------------------------------------------
# FIX: `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='skip'`
# (available since pandas 1.3) is the drop-in replacement and likewise skips
# malformed CSV rows instead of raising.
US_comments = pd.read_csv('UScomments.csv', on_bad_lines='skip')
US_comments.head()
US_videos = pd.read_csv('USvideos.csv', on_bad_lines='skip')
US_videos.head()
# Quick exploration of the videos table: dimensions, cardinality, dtypes.
US_videos.shape
US_videos.nunique()
US_videos.info()
# Same exploration for the comments table, then drop rows with missing values
# (e.g. empty comment_text) so the sentiment analyzer never sees NaN.
US_comments.shape
US_comments.nunique()
US_comments.isnull().sum()
US_comments.dropna(inplace=True)
US_comments.isnull().sum()
US_comments.shape
US_comments.nunique()
US_comments.info()
# NOTE(review): row label 41587 is presumably a single malformed record whose
# likes/replies cannot be cast to int (the casts below would raise otherwise)
# — confirm against the raw CSV before changing this.
US_comments.drop(41587, inplace=True)
US_comments.likes = US_comments.likes.astype(int)
US_comments.replies = US_comments.replies.astype(int)
US_comments.head()
# --- VADER sentiment scoring -------------------------------------------------
# Score every comment with NLTK's VADER analyzer and bucket it into
# Positive / Neutral / Negative by the sign of the compound score.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Compound score is VADER's normalized summary in [-1, 1].
US_comments['Sentiment Scores'] = US_comments['comment_text'].apply(
    lambda text: analyzer.polarity_scores(text)['compound'])
US_comments.head()


def _label_sentiment(score):
    # Sign of the compound score decides the bucket; exactly 0 is Neutral.
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


US_comments['Sentiment'] = US_comments['Sentiment Scores'].apply(_label_sentiment)
US_comments.head()
US_comments.Sentiment.value_counts()
US_comments.video_id.nunique()
# Sentiment breakdown for the first video, as a sanity check.
US_comments[US_comments.video_id == US_comments.video_id.unique()[0]]['Sentiment'].value_counts()
# --- Per-video positivity percentage -----------------------------------------
# Compute, for every video, the share of its comments labeled 'Positive'.
# PERF FIX: the original recomputed US_comments.video_id.unique() — a full
# column scan — twice per loop iteration; hoist it once. Also replaced the
# fragile `.count()[0]` (non-null count of whichever column happens to be
# first) with an explicit boolean sum / len on the Sentiment column.
unique_ids = US_comments.video_id.unique()
videos = []
for vid in unique_ids:
    sentiments = US_comments.loc[US_comments.video_id == vid, 'Sentiment']
    positive = (sentiments == 'Positive').sum()
    total = len(sentiments)  # >= 1: vid came from this very column
    videos.append(round((positive / total) * 100, 2))
# Index by video id, then lift the index into a regular column.
Positivity = pd.DataFrame(videos, unique_ids).reset_index()
Positivity.columns = ['video_id', 'Positive Percentage']
Positivity.head()
# --- Map each video to its channel and export results ------------------------
# PERF FIX: hoist Positivity.video_id.unique() out of the loop (the original
# recomputed it every iteration).
# NOTE(review): each appended element is a numpy array of channel titles
# (normally length 1, since a video belongs to one channel) — kept as-is to
# preserve the original 'Channel' column contents.
unique_video_ids = Positivity.video_id.unique()
channels = []
for vid in unique_video_ids:
    channels.append(US_videos[US_videos.video_id == vid]['channel_title'].unique())
Positivity['Channel'] = channels
Positivity.head()
# Rows tied for the best / worst positive-comment percentage.
Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()]
Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()]
max_positivity = pd.DataFrame(Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()])
min_positivity = pd.DataFrame(Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()])
min_positivity.to_csv("Worst_Channel_influence.csv") #these are the channels with the worst comments on their videos
max_positivity.to_csv("Best_Channel_influence.csv") #these are the channels with the best comments on their videos
# FIX: the original `Positivity[Positivity[c] == Positivity[c]]` was a
# confusing self-comparison whose only effect would be to drop NaN rows —
# and no NaN is possible here (every video has at least one comment, so every
# percentage is a real number). An explicit copy states the intent.
all_channel_sentiment = Positivity.copy()
all_channel_sentiment.to_csv("All_Channel_Sentiment.csv")
sns.set_palette('husl')
%pylab inline
import math
data = pd.read_csv('All_Channel_Sentiment_OG.csv')
data.head()
data.info()
data.describe()
data["Positive_Percentage"].describe()
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline
sns.set_palette('husl')
plt.figure(figsize=(22.5,10))
sns.distplot(data["Positive_Percentage"], color="skyblue", label="Toxicity Distribution")