Data Science
Learn how to investigate the TMDb dataset of 10,000 movies.
In [1]:
# All used packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
In [2]:
# Function to split columns containing several values.
def spl(df, split_c, y):
    """Expand delimiter-separated string columns into one column per value.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    split_c : list of str
        Names of the string columns to split.
    y : str
        Separator passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the columns in ``split_c``, followed by the expanded
        columns. The pieces of column ``c`` are named ``c0``, ``c1``, ...;
        rows with fewer pieces than the widest row are padded with missing
        values.
    """
    # One wide frame per source column; add_prefix turns the positional
    # labels 0, 1, ... into "<column>0", "<column>1", ...
    pieces = [df[c].str.split(y, expand=True).add_prefix(c) for c in split_c]
    # Remove old columns (those containing several values).
    remaining = df.drop(columns=split_c)
    # A single column-wise concat replaces the chain of outer merges that
    # started from an empty DataFrame: every piece already shares df's index.
    return pd.concat([remaining, *pieces], axis=1)
In [3]:
# Function to combine multiple columns into different shapes.
def comp(dataframe, multiple_columns, compare_with):
    """Reshape several value columns against one key column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; it is not modified.
    multiple_columns : list of str
        Columns whose values are pooled together.
    compare_with : str
        Column that becomes the index of both results.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_s``: ``multiple_columns`` stacked row by row, indexed by
        (``compare_with``, source column name).
        ``df_f``: the same columns laid end to end (first column, then the
        second, ...) in a single column indexed by ``compare_with``. That
        column is labelled ``0`` when several differently named columns are
        pooled and keeps its own name when only one column is given.
    """
    values = dataframe.loc[:, multiple_columns]
    key = dataframe.loc[:, [compare_with]]
    # Align key and values on the row index, then promote the key to the index.
    df_m = key.merge(values, how='outer', left_index=True, right_index=True)
    df_m = df_m.set_index(compare_with)
    df_s = pd.DataFrame(df_m.stack())
    # Series.append was removed in pandas 2.0; one concat over all columns
    # produces the same end-to-end Series without growing it in a loop.
    df_f = pd.DataFrame(pd.concat([df_m[n] for n in multiple_columns]))
    return (df_s, df_f)
In [4]:
# Read the TMDb movies CSV from the working directory and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer='tmdb-movies.csv')
df.head(n=10)
Out[4]:
In [5]:
# (rows, columns) of the frame as loaded.
df.shape
Out[5]:
In [6]:
# Number of distinct values in each column.
df.nunique()
Out[6]:
In [7]:
# Column dtypes and non-null counts.
df.info()
In [8]:
# Summary statistics of the numeric columns.
df.describe()
Out[8]:
In [9]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()
Out[9]:
In [10]:
# Show the rows flagged as repeats of an earlier row.
df.loc[df.duplicated()]
Out[10]:
In [11]:
# Total number of missing cells: per-row counts summed over all rows.
df.isna().sum(axis=1).sum()
Out[11]:
In [12]:
# One histogram per numeric column; the trailing `;` suppresses the array-of-Axes repr.
df.hist(figsize=(8,8));
In [13]:
# Drop the identifier, link, free-text and date columns that none of the cells below read.
df = df.drop(
    columns=['id', 'imdb_id', 'homepage', 'tagline', 'keywords', 'overview', 'release_date']
)
df.head(n=1)
Out[13]:
In [14]:
# Remove duplicated rows, keeping the first occurrence of each.
# drop_duplicates works on row positions, so it stays correct even if index
# labels repeat, unlike dropping by the labels of the flagged rows.
df = df.drop_duplicates()
# Sanity check: should now be 0.
df.duplicated().sum()
Out[14]:
In [15]:
# Discard every row that has at least one missing value, then confirm none remain.
df = df.dropna(axis=0, how='any')
df.isna().sum(axis=1).sum()
Out[15]:
In [16]:
# Remove rows with 0 values.
# Filter with a boolean row mask rather than `df[df != 0]`: that form replaces
# zeros with NaN first, which upcasts the affected integer columns to float.
# The dropna keeps the original guarantee that no missing values survive.
df = df[(df != 0).all(axis=1)].dropna()
df.describe()
Out[16]:
In [17]:
# Expand the '|'-separated columns into one column per value (cast0, cast1, ...).
df_n = spl(
    df,
    split_c=['cast', 'genres', 'production_companies'],
    y='|',
)
df_n.info()
In [18]:
# Pool the five cast columns against the movie title:
# `a` is the stacked view, `b` the end-to-end single-column view.
a, b = comp(
    dataframe=df_n,
    multiple_columns=['cast0', 'cast1', 'cast2', 'cast3', 'cast4'],
    compare_with='original_title',
)
In [19]:
# Fast look at the stacked (title, cast column) view returned by comp.
a
Out[19]:
In [20]:
# The answer: the cast member who appears in the most rows of the pooled cast column.
(
    b[0]
    .value_counts()
    .sort_values(ascending=False)
    .head(1)
)
Out[20]:
In [21]:
# The 20 most frequent cast members, most frequent first.
T_act = (
    b[0]
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)
T_act
Out[21]:
In [22]:
# Bar chart of the top 20 actors by number of movies.
ax = T_act.plot.bar()
ax.set_title("the top actors")
ax.set_xlabel("Actor Name")
ax.set_ylabel("Number of movies")
Out[22]:
In [23]:
# Pair each director entry with the movie title.
a, b = comp(
    dataframe=df_n,
    multiple_columns=['director'],
    compare_with='original_title',
)
In [24]:
# Fast look at the stacked (title, 'director') view returned by comp.
a
Out[24]:
In [25]:
# The answer: the director value that occurs in the most rows.
(
    b.iloc[:, 0]
    .value_counts()
    .sort_values(ascending=False)
    .head(1)
)
Out[25]:
In [26]:
# The 20 most frequent director values, most frequent first.
T_act = (
    b.iloc[:, 0]
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)
T_act
Out[26]:
In [27]:
# Bar chart of the top 20 directors by number of movies.
ax = T_act.plot.bar()
ax.set_title("the top directors")
ax.set_xlabel("Director Name")
ax.set_ylabel("Number of movies")
Out[27]:
In [28]:
# Pool the five cast columns against the movie's adjusted revenue.
a, b = comp(
    dataframe=df_n,
    multiple_columns=['cast0', 'cast1', 'cast2', 'cast3', 'cast4'],
    compare_with='revenue_adj',
)
In [29]:
# Label the pooled column 'Actors', bring revenue_adj back as a regular column,
# and total it per actor (each movie's revenue counts once for every listed actor).
b = b.rename(columns={b.columns[0]: 'Actors'}).reset_index()
l = b.groupby('Actors').sum()
# Fast look at the totals, largest first.
l.iloc[:, 0].sort_values(ascending=False)
Out[29]:
In [30]:
# The answer: the actor with the largest summed revenue_adj.
(
    l.iloc[:, 0]
    .sort_values(ascending=False)
    .head(1)
)
Out[30]:
In [31]:
# The 20 actors with the largest summed revenue_adj, largest first.
T_act = (
    l.iloc[:, 0]
    .sort_values(ascending=False)
    .head(20)
)
T_act
Out[31]:
In [32]:
# Bar chart of the top 20 actors by summed revenue.
ax = T_act.plot.bar()
ax.set_title("the top actors")
ax.set_xlabel("Actor Name")
ax.set_ylabel("Revenue")
Out[32]:
In [33]:
# Pair each director entry with the movie's adjusted revenue.
a, b = comp(
    dataframe=df_n,
    multiple_columns=['director'],
    compare_with='revenue_adj',
)
In [34]:
# Label the column 'Director', bring revenue_adj back as a regular column,
# and total it per director value.
b = b.rename(columns={b.columns[0]: 'Director'}).reset_index()
l = b.groupby('Director').sum()
# Fast look at the totals, largest first.
l.iloc[:, 0].sort_values(ascending=False)
Out[34]:
In [35]:
# The answer: the director value with the largest summed revenue_adj.
(
    l.iloc[:, 0]
    .sort_values(ascending=False)
    .head(1)
)
Out[35]:
In [36]:
# The 20 director values with the largest summed revenue_adj, largest first.
T_act = (
    l.iloc[:, 0]
    .sort_values(ascending=False)
    .head(20)
)
T_act
Out[36]:
In [37]:
# Bar chart of the top 20 directors by summed revenue.
ax = T_act.plot.bar()
ax.set_title("the top director")
ax.set_xlabel("Director Name")
ax.set_ylabel("Revenue")
Out[37]:
In [38]:
# Pool the five genre columns against the release year.
a, b = comp(
    dataframe=df_n,
    multiple_columns=['genres0', 'genres1', 'genres2', 'genres3', 'genres4'],
    compare_with='release_year',
)
In [39]:
# Fast look at the stacked (release_year, genre column) view returned by comp.
a
Out[39]:
In [40]:
# The answer: the genre that occurs in the most rows of the pooled genre column.
(
    b[0]
    .value_counts()
    .sort_values(ascending=False)
    .head(1)
)
Out[40]:
In [41]:
# Up to 20 genres ranked by how often they occur, most frequent first.
T_act = (
    b[0]
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)
T_act
Out[41]:
In [42]:
# Bar chart of the most frequent genres by number of movies.
ax = T_act.plot.bar()
ax.set_title("the top genres")
ax.set_xlabel("Genres Name")
ax.set_ylabel("Number of movies")
Out[42]:
In [43]:
# Label the pooled column 'Genres' and bring release_year back as a regular column.
b = b.rename(columns={b.columns[0]: 'Genres'}).reset_index()
# Combined "<year>_<genre>" key so each (year, genre) pair can be counted as one value.
b = b.assign(Y_G=b["release_year"].astype(str) + "_" + b["Genres"])
b.head()
Out[43]:
In [44]:
# Count the rows per "<year>_<genre>" key (third column of b) and turn the
# counts into a two-column frame: the key and its count.
T_act = b.iloc[:, 2].value_counts().to_frame().reset_index()
In [45]:
# Name the key and count columns.
T_act.columns = ['Y_G', 'N']
# Split the key back into year and genre. n=1 splits only at the first '_',
# so a genre name containing '_' still yields exactly two columns.
T = T_act['Y_G'].str.split('_', n=1, expand=True)
T.columns = ['release_year', 'Genres']
T['N'] = T_act['N']
# Order by year (compared as strings) and renumber the rows.
t = T.sort_values(by='release_year')
# NOTE: this rebinds `df`, which until here held the cleaned movies table.
df = t.reset_index(drop=True)
df
Out[45]:
In [46]:
# Reshape to one row per release year and one column per genre, holding the count N.
# NOTE(review): this rebinds `df` from the long table to the pivoted one, so
# running this cell a second time fails (the pivot columns no longer exist).
df = df.pivot(index = 'release_year' , columns = 'Genres' , values = 'N' )
In [47]:
# Grouped bar chart: number of movies per genre for each release year.
ax = df.plot.bar(figsize=(20, 15))
ax.set_title("Most popular genres from year to year")
ax.set_xlabel("Release year")
ax.set_ylabel("Number of movies")
Out[47]:
In [51]:
# Line chart: number of movies per genre across release years.
# (No extra import is needed here; plotting goes through the returned Axes.)
ax = df.plot.line(figsize=(20, 15))
ax.set_title("Most popular genres from year to year")
ax.set_xlabel("Change over years")
ax.set_ylabel("Number of movies")
Out[51]:
In [49]:
# Histogram of the yearly movie counts, one distribution per genre.
# The x axis is the count of movies in a year; the y axis is how many years
# fall into each bin.
ax = df.plot.hist(figsize=(10, 7))
ax.set_title("Most popular genres from year to year")
ax.set_xlabel("Number of movies per year")
ax.set_ylabel("Number of years")
# `df` is deliberately left unchanged so this cell, and the plots that read
# the same frame, give the same result on every run.
- Andrew Samy
- Mar, 27 2022
Add New Comments
Please log in to leave a comment.