In [1]:
#import Pandas
import pandas as pd

Pandas Series

In [2]:
#create a pandas series
series_example = pd.Series([2, 17, 19, 21])
print(series_example)
0     2
1    17
2    19
3    21
dtype: int64
In [3]:
#inspect the series
series_example.values
Out[3]:
array([ 2, 17, 19, 21])
In [4]:
#inspect the series to see start and stop index values
series_example.index
Out[4]:
RangeIndex(start=0, stop=4, step=1)
In [6]:
#create a series with a custom index
series2_example = pd.Series([2, 17, 19, 21], index=["2018-10-01", "2018-10-02", "2018-10-03", "2018-10-04"])
series2_example
Out[6]:
2018-10-01     2
2018-10-02    17
2018-10-03    19
2018-10-04    21
dtype: int64
In [7]:
#selecting specific index from the series
series2_example[1]
Out[7]:
17
In [8]:
#We can now change the values, if we like
series2_example[1]= 200
series2_example[1]
Out[8]:
200
In [9]:
#select multiple items, using our custom index
series2_example[["2018-10-01", "2018-10-02"]]
Out[9]:
2018-10-01      2
2018-10-02    200
dtype: int64
In [10]:
#we can do boolean checks on the series
"2018-10-01" in series2_example
Out[10]:
True
In [11]:
"2018-10-11" in series2_example
Out[11]:
False
In [12]:
#we can create a series from a dictionary
dictionary = {"Alfa" : 2111, "Volvo" : 2123, "BMW" : 2132}
newseries = pd.Series(dictionary)
newseries
Out[12]:
Alfa     2111
Volvo    2123
BMW      2132
dtype: int64
In [13]:
#In the above, the index keys will be in the same order as the dictionary. If we want to change the order, we can pass this in:
order = ["BMW", "Volvo", "Alfa"]
newseries = pd.Series(dictionary, index=order)
newseries
Out[13]:
BMW      2132
Volvo    2123
Alfa     2111
dtype: int64
In [14]:
#We can check for null values in our series, using the below:
pd.isnull(newseries)
Out[14]:
BMW      False
Volvo    False
Alfa     False
dtype: bool
In [15]:
#we can also check if the columns are not null
pd.notnull(newseries)
Out[15]:
BMW      True
Volvo    True
Alfa     True
dtype: bool
In [16]:
#The series automatically aligns index labels for arithmetic functions (similar to a join in SQL):
newseriesadd = pd.Series(dictionary, index=order)
newseriesadd + newseries
Out[16]:
BMW      4264
Volvo    4246
Alfa     4222
dtype: int64

Pandas Dataframes

Dataframes are two dimensional arrays (basically a table)

In [18]:
#Convert a dictionary to a dataframe
dataset = {
    "Manufacturer": ["Kia", "Masda", "Mercedes", "Tesla"],
    "Total Sales": [1000000, 2000000, 3434432, 997000],
    "Total Profit": [100000, 200000, 400000, 200000]
}
dataframe = pd.DataFrame(dataset)
dataframe
Out[18]:
Manufacturer Total Sales Total Profit
0 Kia 1000000 100000
1 Masda 2000000 200000
2 Mercedes 3434432 400000
3 Tesla 997000 200000
In [19]:
#We can inspect our newly created dataframe, seeing only the top x rows.
dataframe.head(5)
Out[19]:
Manufacturer Total Sales Total Profit
0 Kia 1000000 100000
1 Masda 2000000 200000
2 Mercedes 3434432 400000
3 Tesla 997000 200000
In [21]:
#Or we can use the tail function to inspect only the bottom x rows.
dataframe.tail(5)
Out[21]:
Manufacturer Total Sales Total Profit
0 Kia 1000000 100000
1 Masda 2000000 200000
2 Mercedes 3434432 400000
3 Tesla 997000 200000
In [23]:
#In the below example, I’m selecting only certain columns from my dataframe:
df2 = dataframe[['Manufacturer', 'Total Sales']]
df2
Out[23]:
Manufacturer Total Sales
0 Kia 1000000
1 Masda 2000000
2 Mercedes 3434432
3 Tesla 997000
In [27]:
#We may want to re-order some columns for easier reading. We can do that using the below:
dataframe = pd.DataFrame(dataset, columns=["Total Profit", "Total Sales", "Manufacturer"])
dataframe
Out[27]:
Total Profit Total Sales Manufacturer
0 100000 1000000 Kia
1 200000 2000000 Masda
2 400000 3434432 Mercedes
3 200000 997000 Tesla
In [28]:
#We may wish to transpose our dataframe (swap the rows and columns around). We can do that as below:
dataframe.T
Out[28]:
0 1 2 3
Total Profit 100000 200000 400000 200000
Total Sales 1000000 2000000 3434432 997000
Manufacturer Kia Masda Mercedes Tesla
In [29]:
#We can also inspect our dataframe using the below function:
dataframe.columns
Out[29]:
Index(['Total Profit', 'Total Sales', 'Manufacturer'], dtype='object')
In [32]:
#We can create a dataframe from a file, as below. In this script, we define the delimiter as being a comma and infer the header from the file.
path = "mock.csv"
df = pd.read_csv(path, delimiter=',', header='infer')
df.head()
Out[32]:
id first_name last_name email gender ip_address
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
2 3 Claudell Shambroke cshambroke2@smugmug.com Male 224.21.101.55
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
4 5 Sayer Kielty skielty4@1und1.de Male 95.121.207.185
In [36]:
#We can retrieve a specific field from the dataframe, like below:
df["gender"].head()
Out[36]:
0    Female
1    Female
2      Male
3    Female
4      Male
Name: gender, dtype: object
In [35]:
#Or we could do the same with the below notation:
df.first_name.head()
Out[35]:
0    Kariotta
1        Pier
2    Claudell
3     Glennis
4       Sayer
Name: first_name, dtype: object
In [38]:
#We can select a specific row like this – this assumes we know the index number of that row:
df.loc[2]
Out[38]:
id                                  3
first_name                   Claudell
last_name                   Shambroke
email         cshambroke2@smugmug.com
gender                           Male
ip_address              224.21.101.55
Name: 2, dtype: object

Dataframe Filtering

In [39]:
#We can start interacting with our dataframes with more logic. In the below example, we’re selecting only those records where gender is female.
df2 = df.loc[df["gender"] == 'Female']
df2.head()
Out[39]:
id first_name last_name email gender ip_address
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
6 7 Haily Eldon heldon6@china.com.cn Female 155.164.158.93
7 8 Horatia Wanjek hwanjek7@disqus.com Female 111.49.4.223
In [40]:
#And in the below, we use the equivalent of an SQL %LIKE% operator & select all email addresses that include ‘oracle’.
df3 = df.loc[df["email"].str.contains('oracle')]
df3.head()
Out[40]:
id first_name last_name email gender ip_address
385 386 Laurel McKeaney lmckeaneyap@oracle.com Female 223.195.187.101
494 495 Isadore Riach iriachdq@oracle.com Male 150.241.133.78
549 550 Silvanus Gelly sgellyf9@oracle.com Male 161.65.47.154
In [41]:
#We can add multiple where conditions, as below. Here, we’re looking for all records where gender is Female and the ID is greater than 5.
df[(df['gender'] == 'Female') & (df['id'] > 5)].head()
Out[41]:
id first_name last_name email gender ip_address
6 7 Haily Eldon heldon6@china.com.cn Female 155.164.158.93
7 8 Horatia Wanjek hwanjek7@disqus.com Female 111.49.4.223
10 11 Lorie Colmer lcolmera@mysql.com Female 124.216.128.87
11 12 Nadeen Ghilardini nghilardinib@plala.or.jp Female 243.164.209.96
12 13 Dinnie Kreutzer dkreutzerc@msn.com Female 183.136.54.204
In [42]:
#And this example does the same as above, but is OR rather than AND. Or is depicted by the pipe.
df[(df['gender'] == 'Female') | (df['id'] > 5)].head()
Out[42]:
id first_name last_name email gender ip_address
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
5 6 Goraud Orrill gorrill5@histats.com Male 40.79.38.123
6 7 Haily Eldon heldon6@china.com.cn Female 155.164.158.93
In [44]:
#We can select all data where the value is not null, as below:
df2[(df2['gender'].notna())].head()
Out[44]:
id first_name last_name email gender ip_address
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
6 7 Haily Eldon heldon6@china.com.cn Female 155.164.158.93
7 8 Horatia Wanjek hwanjek7@disqus.com Female 111.49.4.223

Dataframe Arithmetic

In [45]:
#We can sum a particular column within a dataframe using the below script:
df['id'].sum()
Out[45]:
500500
In [46]:
#We can also complete an SQL style sum & groupby clause, as below:
df.groupby(['gender'])[['id']].sum()
Out[46]:
id
gender
Female 249420
Male 251080
In [48]:
#We can group by multiple columns, should we need to:
df.groupby(['gender', 'first_name'])[['id']].sum().head()
Out[48]:
id
gender first_name
Female Abigale 740
Adda 731
Adoree 800
Adrea 426
Adriana 125
In [51]:
#We can also sum and group by multiple columns, as shown below:
df.groupby(['gender', 'first_name'])[['id', 'id']].sum().head()
Out[51]:
id id
gender first_name
Female Abigale 740 740
Adda 731 731
Adoree 800 800
Adrea 426 426
Adriana 125 125
In [53]:
#The below is the equivalent to count(*) in SQL. However, it counts only where the value is not null.
df.groupby(['gender'])[['id']].size()
Out[53]:
gender
Female    498
Male      502
dtype: int64
In [55]:
#We can now calculate the mean, min or max of specific columns using numpy, as below:
import numpy as np
df.groupby('gender').agg({'id': np.mean, 'id': np.max})
Out[55]:
id
gender
Female 999
Male 1000

Joining & union dataframes

In [57]:
'''
In the below, we join two dataframes (I duplicated the dataframe to make this simple). The syntax is:
df1.join(Dataframe_To_Join, field_to_join_on, Type_of_join, left_table_suffix, right_table_suffix). 
'''
#duplicate dataframe
df2 = pd.read_csv(path, delimiter=',', header='infer')

#join
df.join(df2, 'id', 'left', 'df', 'df2').head()
Out[57]:
iddf first_namedf last_namedf emaildf genderdf ip_addressdf iddf2 first_namedf2 last_namedf2 emaildf2 genderdf2 ip_addressdf2
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49 2.0 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135 3.0 Claudell Shambroke cshambroke2@smugmug.com Male 224.21.101.55
2 3 Claudell Shambroke cshambroke2@smugmug.com Male 224.21.101.55 4.0 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65 5.0 Sayer Kielty skielty4@1und1.de Male 95.121.207.185
4 5 Sayer Kielty skielty4@1und1.de Male 95.121.207.185 6.0 Goraud Orrill gorrill5@histats.com Male 40.79.38.123
In [58]:
#We may also need to stitch two dataframes together (or create a union), we can do that using the concat function, as below:
pd.concat([df, df2]).head()
Out[58]:
id first_name last_name email gender ip_address
0 1 Kariotta Buesden kbuesden0@histats.com Female 158.189.151.49
1 2 Pier Blasik pblasik1@washingtonpost.com Female 50.142.146.135
2 3 Claudell Shambroke cshambroke2@smugmug.com Male 224.21.101.55
3 4 Glennis Cheeld gcheeld3@yahoo.com Female 60.9.50.65
4 5 Sayer Kielty skielty4@1und1.de Male 95.121.207.185
In [ ]: