写在前面
最近在学习python,结合一个实际案例,写一下python和R在做数据分析上的差异。
本人还不是特别熟练Python,所以Python的代码来自于Kaggle的一个高票(high-vote)回帖。
我这里只是转写一下R的版本,转写python代码之后感觉python做数据分析和可视化实在不如R给力。代码丢这了,有机会说说如何用tidyverse分析数据吧。
这里写了多数代码,剩下流程差不多的就放弃写了。还有机器学习的部分回头有心情了用tidymodels写一下基本的框架吧。
Netflix is a streaming application whose popularity and catalogue of shows and content keep growing. This is an
exploratory data analysis (EDA) that tells a story through its data, together with a content-based recommendation
system and a wide range of graphs and visuals.
The python source code is from here
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
library(tidyverse)
library(skimr)
# Load the 2021-04-20 TidyTuesday release and keep its `netflix_titles` table.
data <- tidytuesdayR::tt_load('2021-04-20')
netfix_dta <- data$netflix_titles
# Install a Python module through reticulate if your Python environment lacks it:
# reticulate::py_install('seaborn',pip = TRUE)
Pass the data from R to Python in RStudio (reticulate exposes R objects to Python through the `r` object).
netflix_overall=r.netfix_dta
netflix_overall.head()
Also, you can do the same thing using R
head(netfix_dta)
Or
glimpse(netfix_dta)
Therefore, it is clear that the dataset contains 12 columns for exploratory analysis.
netflix_overall.count()
Also, in R you can do it better.
skim(netfix_dta)
netflix_shows=netflix_overall[netflix_overall['type']=='TV Show']
netflix_shows.head()
In R, you can use the pipe to do the same thing, which makes your script easy to read.
netflix_shows <- netfix_dta %>%
filter(type == "TV Show")
head(netflix_shows)
netflix_movies=netflix_overall[netflix_overall['type']=='Movie']
netflix_movies <- netfix_dta %>%
filter(type == "Movie")
Analysis of Movies vs TV Shows.
sns.set(style="darkgrid")
ax = sns.countplot(x="type", data=netflix_overall, palette="Set2")
plt.show()
In R
# Bar chart of the number of titles per type.
# fct_rev() reverses the level order of `type` on the x axis.
ggplot(
  data = netfix_dta,
  mapping = aes(x = fct_rev(type), fill = type)
) +
  geom_bar() +
  theme_bw()
It is evident that there are more Movies on Netflix than TV shows.
If a producer wants to release some content, in which month should they do so? (The month in which the least amount of content is added.)
netflix_date = netflix_shows[['date_added']].dropna()
netflix_date['year'] = netflix_date['date_added'].apply(lambda x : x.split(', ')[-1])
netflix_date['month'] = netflix_date['date_added'].apply(lambda x : x.lstrip().split(' ')[0])
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]
df = netflix_date.groupby('year')['month'].value_counts().unstack().fillna(0)[month_order].T
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(df, cmap='afmhot_r', edgecolors='white', linewidths=2) # heatmap
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(df.index), 1), df.index, fontsize=7, fontfamily='serif')
plt.title('Netflix Contents Update', fontsize=12, fontfamily='calibri', fontweight='bold', position=(0.20, 1.0+0.02))
cbar = plt.colorbar()
cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()
plt.show()
library(lubridate)
library(viridis)
# Heatmap of the number of titles added per year (x axis) and calendar
# month (y axis). Rows whose date is missing or cannot be parsed are dropped.
netfix_dta %>%
  transmute(date_added = mdy(date_added)) %>%
  filter(!is.na(date_added)) %>%
  mutate(
    month = month(date_added, label = TRUE, abbr = FALSE),
    year = year(date_added)
  ) %>%
  count(year, month, name = "contents") %>%
  ggplot(aes(x = year, y = fct_rev(month), fill = contents)) +
  geom_tile() +
  viridis::scale_fill_viridis(option = "A") +
  labs(
    title = "Netflix Contents Update",
    x = "",
    y = ""
  )
Movie ratings analysis
plt.figure(figsize=(12,10))
sns.set(style="darkgrid")
ax = sns.countplot(x="rating", data=netflix_movies, palette="Set2", order=netflix_movies['rating'].value_counts().index[0:15])
plt.show()
In R
# Count movies per rating and plot the counts in descending order.
# Uses `netflix_movies` rather than the full data set so that the chart
# matches the "Movie ratings analysis" heading and the Python countplot,
# which is also restricted to movies.
netflix_movies %>%
  filter(!is.na(rating)) %>%
  count(rating) %>%
  ggplot(aes(x = fct_reorder(rating, n, .desc = TRUE), y = n, fill = rating)) +
  geom_col(show.legend = FALSE) +
  # No padding below the bars, 10% head-room above them.
  scale_y_continuous(expand = expansion(c(0, .1))) +
  labs(
    x = "Rating",
    y = "Count"
  )
Analysing IMDB ratings to get top rated movies on Netflix
imdb_ratings=pd.read_csv('netflix/IMDb ratings.csv',usecols=['weighted_average_vote'])
imdb_titles=pd.read_csv('netflix/IMDb movies.csv', usecols=['title','year','genre'])
ratings = pd.DataFrame({'Title':imdb_titles.title, 'Release Year':imdb_titles.year, 'Rating': imdb_ratings.weighted_average_vote, 'Genre':imdb_titles.genre})
ratings.drop_duplicates(subset=['Title','Release Year','Rating'], inplace=True)
ratings.shape
ratings.head()
In R
# Read the IMDb ratings and movie metadata, selecting columns by name so the
# code does not silently break if the column order of the CSV files changes.
imdb_ratings <- read_csv("netflix/IMDb ratings.csv") %>%
  select(imdb_title_id, weighted_average_vote)
imdb_titles <- read_csv("netflix/IMDb movies.csv") %>%
  select(imdb_title_id, title, year, genre)
# Attach each movie's rating through the shared IMDb id, then drop rows that
# repeat the same title / year / rating, mirroring the drop_duplicates() call
# in the Python version.
ratings <- imdb_titles %>%
  left_join(imdb_ratings, by = "imdb_title_id") %>%
  select(title, year, genre, Rating = weighted_average_vote) %>%
  distinct(title, year, Rating, .keep_all = TRUE)
ratings
dim(ratings)
ratings.dropna()
joint_data=ratings.merge(netflix_overall,left_on='Title',right_on='title',how='inner')
joint_data=joint_data.sort_values(by='Rating', ascending=False)
joint_data.head()
joint_data.shape
# Keep only complete rating rows, match them to Netflix titles by name and
# sort from best to worst rated.
# drop_na() replaces filter(!is.na(.)): is.na() on a whole data frame returns
# a logical matrix, which is not a valid row predicate for filter().
joint_data <- ratings %>%
  drop_na() %>%
  inner_join(netfix_dta, by = "title") %>%
  arrange(desc(Rating))
dim(joint_data)
import plotly.express as px
top_rated=joint_data[0:10]
top_rated
fig =px.sunburst(
top_rated,
path=['title','country'],
values='Rating',
color='Rating')
fig.show()
library(plotly)
# Take up to the ten best-rated titles. head() does not pad the result with
# NA rows when joint_data has fewer than ten rows (joint_data[1:10, ] would).
top_rated <- head(joint_data, 10)
# Sunburst with two levels: one root sector per title (empty parent) and one
# child sector per title's country, attached to its title.
fig <- plot_ly(
  ids = c(top_rated$title, paste0(top_rated$title, "-", top_rated$country)),
  labels = c(top_rated$title, top_rated$country),
  # One empty parent per title, however many titles were selected.
  parents = c(rep("", nrow(top_rated)), top_rated$title),
  # NOTE(review): `colors` receives the numeric ratings here; confirm this
  # produces the intended rating-based colouring for a sunburst trace.
  colors = c(top_rated$Rating, top_rated$Rating),
  type = "sunburst",
  branchvalues = "total"
)
fig
fig =px.sunburst(
r.top_rated,
path=['title','country'],
values='Rating',
color='Rating')
fig.show()
Countries with highest rated content.
country_count=joint_data['country'].value_counts().sort_values(ascending=False)
country_count=pd.DataFrame(country_count)
topcountries=country_count[0:11]
topcountries
# Number of joined titles per country, most frequent first.
# Rows without a country are excluded.
topcountries <- joint_data %>%
  filter(!is.na(country)) %>%
  count(country, sort = TRUE)
import plotly.express as px
data = dict(
number=[1063,619,135,60,44,41,40,40,38,35],
country=["United States", "India", "United Kingdom", "Canada", "Spain",'Turkey','Philippines','France','South Korea','Australia'])
fig = px.funnel(data, x='number', y='country')
fig.show()
library(reticulate)
# Pull the `data` dict defined in the Python chunk into R as a data frame and
# order it by count, largest first.
# NOTE(review): this rebinds the R object `data` created when the dataset was
# loaded; `netfix_dta` was already extracted from it at that point.
data <- py$data %>%
  as.data.frame() %>%
  arrange(desc(number))
# Funnel chart of titles per country. The trailing comma after `type` was
# removed because it passed an empty argument to plot_ly().
plot_ly(
  y = data$country,
  x = data$number,
  type = "funnel"
) %>%
  layout(yaxis = list(categoryarray = data$country))
Year wise analysis
plt.figure(figsize=(12,10))
sns.set(style="darkgrid")
ax = sns.countplot(y="release_year", data=netflix_movies, palette="Set2", order=netflix_movies['release_year'].value_counts().index[0:15])
plt.show()
# Horizontal bar chart of the 15 release years with the most movies.
netflix_movies %>%
  count(release_year, sort = TRUE) %>%
  slice_head(n = 15) %>%
  # Freeze the current (descending-count) order as the factor level order.
  mutate(release_year = factor(release_year, levels = release_year)) %>%
  ggplot(aes(y = fct_rev(release_year), x = n, fill = release_year)) +
  geom_col(show.legend = FALSE) +
  ggsci::scale_fill_simpsons()
Analysis of duration of movies
# Strip the " min" suffix and convert the movie durations to integers.
# assign() builds a new frame instead of writing into a filtered slice of
# netflix_overall, which avoids pandas' SettingWithCopyWarning.
netflix_movies = netflix_movies.assign(
    duration=netflix_movies['duration'].str.replace(' min', '').astype(int)
)
netflix_movies['duration']
plt.figure(figsize=(8,8))
sns.set(style="darkgrid")
# `fill` is the current name of kdeplot's deprecated `shade` argument.
sns.kdeplot(data=netflix_movies['duration'], fill=True)
plt.show()
# Density of movie durations: remove the " min" suffix, convert to numeric
# and draw a filled density curve.
netflix_movies %>%
  mutate(duration = as.double(str_replace(duration, " min", ""))) %>%
  ggplot(mapping = aes(x = duration)) +
  geom_density(fill = "blue2", alpha = .4) +
  ggthemes::theme_solarized()
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from collections import Counter
genres=list(netflix_movies['listed_in'])
gen=[]
for i in genres:
i=list(i.split(','))
for j in i:
gen.append(j.replace(' ',""))
g=Counter(gen)
text = list(set(gen))
plt.rcParams['figure.figsize'] = (13, 13)
wordcloud = WordCloud(max_words=1000000,background_color="white").generate(str(text))
plt.imshow(wordcloud,interpolation="bilinear")
plt.axis("off")
plt.show()
library(wordcloud)
library(tidytext)
# Fix the seed before drawing the word cloud.
set.seed(2021)
# Split `listed_in` into one word per row and count how often each occurs.
genre_words <- netflix_movies %>%
  unnest_tokens(word, listed_in) %>%
  count(word, sort = TRUE)
# Draw at most the 100 most frequent words.
wordcloud(genre_words$word, genre_words$n, max.words = 100)
# Only `matplotlib.pyplot` was imported (as `plt`), so the top-level package
# must be imported before matplotlib.use() can be called.
import matplotlib
matplotlib.use('TkAgg')
# Sort the genre counts, most frequent first.
g={k: v for k, v in sorted(g.items(), key=lambda item: item[1], reverse= True)}
g
fig, ax = plt.subplots()
x=list(g.keys())
y=list(g.values())
# Lollipop chart: one stem per genre with a marker at its count.
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x,y, "o", color='maroon')
# Rotate the existing tick labels instead of overwriting them with
# set_xticklabels(), which expects the tick positions to be fixed first.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Count of movies")
# set a title
ax.set_title("Genres")
plt.show()
# Bring the Python genre counter `g` into R as a one-column data frame (`n`)
# whose row names are the genre names.
g <- py$g %>% unlist() %>% data.frame() %>% select(n = ".")
# Lollipop chart of the genre counts, most frequent genre first.
g %>%
  rownames_to_column("name") %>%
  mutate(name = fct_reorder(name, n, .desc = TRUE)) %>%
  ggplot(aes(x = name, y = n)) +
  geom_segment(aes(xend = name, y = 0, yend = n)) +
  geom_point(size = 5, color = "orange") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Lowest number of seasons.
features=['title','duration']
# Copy the slice so the new column is written to an independent frame
# (avoids pandas' SettingWithCopyWarning on a view of netflix_shows).
durations= netflix_shows[features].copy()
# "1 Season" / "3 Seasons" -> 1 / 3
durations['no_of_seasons']=durations['duration'].str.replace(r' Seasons?', '', regex=True).astype(int)
t=['title','no_of_seasons']
top=durations[t]
top=top.sort_values(by='no_of_seasons', ascending=False)
# Ascending order puts the shows with the fewest seasons first.
bottom=top.sort_values(by='no_of_seasons')
bottom=bottom[20:50]
import plotly.graph_objects as go
# The table is rendered by plotly, so no matplotlib figure is created here.
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'No of seasons']), cells=dict(values=[bottom['title'],bottom['no_of_seasons']],fill_color='lavender'))])
fig.show()
library(kableExtra)
# Table of TV shows ordered by number of seasons, fewest first, to match the
# "Lowest number of seasons" heading and the ascending `bottom` table of the
# Python version.
netflix_shows %>%
  select(title, duration) %>%
  # "2 Seasons" -> duration = "2", season = "Seasons". The separator is named
  # explicitly instead of being passed positionally.
  separate(duration, into = c("duration", "season"), sep = " ") %>%
  mutate(duration = as.numeric(duration)) %>%
  arrange(duration) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))