"""
Created on Sun Nov 15 22:04:12 2020

@author: arhamze
"""

import csv
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from collections import defaultdict as dfdict


# Load the ramen ratings CSV into `dataset`, a list with one dict per review.
# Column names applied to every row (the file's own header row is skipped).
header=['Review #','Brand','Variety','Style','Country','Stars','Top Ten']
dataset=[]

# The with-block closes the file even if parsing fails; newline='' is the
# mode the csv module asks for so that it handles line endings itself.
with open('/home/arhamze/SEMESTER_7/Python Master Course/python data processing and visualization/final_project/ramen-ratings.csv', newline='') as f:
    data=csv.reader(f, delimiter =',')
    next(data, None)  # skip the header row; tolerate an empty file

    # Combine the header with each data row and convert the numeric columns.
    for row in data:
        d=dict(zip(header,row))
        try:
            d['Review #']=int(d['Review #'])
            d['Stars']=float(d['Stars'])
        except ValueError:
            # A non-numeric value (e.g. an unrated review) is stored as 0 stars.
            d['Stars']=0
            print ('This Unrated, so we change it with 0 ')
        dataset.append(d)
#print(dataset[:10])

# Output:
# This Unrated, so we change it with 0 
# This Unrated, so we change it with 0 
# This Unrated, so we change it with 0


# =============================================================================
# Statistic
# =============================================================================

# 1. Total data: number of reviews that were loaded.
print('The total of data in dataset is', len(dataset))


# 2. Average of the star ratings, computed in two ways.
stars = [review['Stars'] for review in dataset]
stars_array = np.array(stars)

# First method: built-in sum over the array elements divided by the row count.
average = sum(stars_array) / len(dataset)
print('The average from the first method is ', average)

# Second method: let NumPy compute the average directly.
average_2 = np.average(stars_array)
print('The average from the second method is', average_2)

# Output:
# The total of data in dataset is 2580
# The average from the first method is  3.6504263565891475
# The average from the second method is 3.6504263565891466


# =============================================================================
# Visualization
# =============================================================================

### 1. Histogram of the star ratings
all_stars=sorted(review['Stars'] for review in dataset)

## Frequency table: rating value -> number of reviews with that rating.
stars_total=dfdict(int)
for rating in all_stars:
    stars_total[rating]+=1
#print (stars_total)
X_axis=list(stars_total.keys())
Y_axis=list(stars_total.values())

plt.figure(1)
# Expand the frequency table back to one entry per review for plt.hist.
# A comprehension keeps its variables local, so the module-level `stars`
# list from the statistics section is no longer overwritten by a loop variable.
stars_freq=[rating for rating, count in stars_total.items() for _ in range(count)]
#print (stars_freq)
plt.gca().set(ylabel='Total reviewers', xlabel='Stars Rating', title='Histogram',)
plt.hist(stars_freq,bins=15, color='green')
plt.grid()
plt.show()


# =============================================================================
# # =============================================================================
# # Calculating The Brand Popularity
# # ==============================================================================
import itertools as itr
import operator as op
import collections as clc
 

### One pass over the reviews collects both per-brand tables:
###   popularity   -> number of reviews per brand
###   brand_rating -> list of star ratings per brand
popularity = dfdict(int)
brand_rating = dfdict(list)
for review in dataset:
    brand_name = review['Brand']
    popularity[brand_name] += 1
    brand_rating[brand_name].append(review['Stars'])

### Most popular brands: (review count, brand) tuples in ascending order,
### so the ten most reviewed brands are the last ten entries.
pop = sorted((count, brand) for brand, count in popularity.items())
pop10 = pop[-10:]

### Average rating per brand.
avg_brand = {
    brand: sum(ratings) / len(ratings)
    for brand, ratings in brand_rating.items()
}

### Best rated brands: only brands with more than 30 reviews are considered.
### Tuples are (average rating, brand) in ascending order.
top_rated_brand = sorted(
    (avg, brand)
    for brand, avg in avg_brand.items()
    if len(brand_rating[brand]) > 30
)
top10_rated = top_rated_brand[-10:]

### Sort the popularity table by its values (review counts), largest first.
### Index 0 of each item would sort by brand name instead.
populer_sorted = sorted(
    popularity.items(), key=lambda item: item[1], reverse=True
)
pp_sorted_dict = dict(populer_sorted)


import pandas as pd
 
# Build a two-column table (Brand, Popularity) from the sorted popularity
# dict: the brands start out as the index, are moved into a regular column,
# and both columns are then given readable names.
data_frame = (
    pd.DataFrame.from_dict(pp_sorted_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'Brand', 0: 'Popularity'})
)


# =============================================================================
#  Visualization of Brand Top Popularity
# =============================================================================
plt.figure(3)
popularity_axes = plt.gca()
popularity_axes.set(
    ylabel='Popularity',
    xlabel='Brand',
    title='Bar PLot For Top 10 Popular Brand',
)
popularity_axes.grid()
plt.xticks(rotation=60)
# Rows 0..9 are the ten brands with the most reviews.
top_popular = data_frame.loc[0:9]
popularity_axes.bar(top_popular['Brand'], top_popular['Popularity'])
 
# =============================================================================
#  Visualization of Brand Top Rated
# =============================================================================
# top_rated_brand holds (average rating, brand) tuples. Sort them best-first
# and build the table directly from the tuples. Going through a dict keyed by
# the average rating would silently drop brands that share the same average.
brnd_tp_rated_sorted=sorted(top_rated_brand, reverse=True)
df_rate=pd.DataFrame(brnd_tp_rated_sorted, columns=['Rating','Brand'])
#print (df_rate)

plt.figure (4)
plt.gca().set(ylabel='Ratings', xlabel='Brand', title='Bar PLot For Top 10 Rating Brand ', ylim=(3,5))
plt.grid()
plt.xticks(rotation=60)
# Rows 0..9 are the ten brands with the highest average rating.
plt.bar(df_rate.loc[0:9,'Brand'],df_rate.loc[0:9,'Rating'])
# Display the figures created since the last plt.show(); without this call
# the bar plots never appear when the file is run as a plain script.
plt.show()


#sorted_values = sorted(popularity.values()) # Sort the values
#popularity_sorted= {}
#for i in sorted_values:
#    for k in popularity.keys():
#        if popularity[k] == i:
#            popularity_sorted[k] = popularity[k]
#            break
#print (popularity_sorted)

# Output: <BarContainer object of 10 artists>

# Basic Python Data Processing and Visualization
#
# Basic Data Processing and Visualization
#
# Ramen Ratings Dataset
#
# The Dataset of Ramen Ratings
#
# PART 1: Import the Raw Data and Create the Dataset
#
# PART 2: Do Some Simple Statistics on the Data
#
# PART 3: Visualizing the Ratings Distribution from All Reviews
#
# PART 4: Visualizing the Brand Popularity and the Top-Rated Brands
#
# This is the end, enjoy and thank you

# Comments
#
# Basic Data Processing and Visualization
#
# Ramen Ratings Dataset
#
# The Dataset of Ramen Ratings
#
# PART 1: Import the Raw Data and Create the Dataset
#
# PART 2: Do Some Simple Statistics on the Data
#
# PART 3: Visualizing the Ratings Distribution from All Reviews
#
# PART 4: Visualizing the Brand Popularity and the Top-Rated Brands
#
# This is the end, enjoy and thank you
#
# Related Posts:
#
# Comments