Domain Normalization

This script aims to normalise domain names. That is, drop the subdomain from a list of domains, to 'clean' them and be able to analyse the usage across a whole domain much more easily. This dataset is from https://www.domcop.com/top-10-million-domains

In [1]:
#Import all the required modules
import pandas as pd
import time 
from datetime import datetime 
import os
import glob

#define the path to the csv file & read it into a dataframe
path = "top10milliondomains.csv"
df = pd.read_csv(path, delimiter=',', header='infer', encoding = "ISO-8859-1")
In [2]:
df.head()
Out[2]:
Rank Domain Open Page Rank
0 1 fonts.googleapis.com 10.0
1 2 facebook.com 10.0
2 3 youtube.com 10.0
3 4 twitter.com 10.0
4 5 google.com 10.0
In [16]:
#strip out all ---- values
df2 = df[['Domain']].loc[df['Domain'] != '----']
df2.head(5)
Out[16]:
Domain
0 fonts.googleapis.com
1 facebook.com
2 youtube.com
3 twitter.com
4 google.com
In [45]:
#DEFINE REQUIRED LOOKUP LISTS AND AN EMPTY LIST TO APPEND TO
tld = ('co.uk', 'com', 'org', 'gov.uk', 'co', 'net', 'news', 'it', 'in' 'es', 'tw', 'pe', 'io', 'ca', 'cat', 'com.au','com.ar', 'com.mt', 'com.co', 'ws', 'to', 'es', 'de', 'us', 'br', 'im', 'gr', 'cc', 'cn', 'org.uk', 'me', 'ovh', 'be','tv', 'tech', '..', 'life', 'com.mx', 'pl', 'uk', 'ru', 'cz', 'st', 'info', 'mobi', 'today', 'eu', 'fi', 'jp', 'life','1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'earth', 'ninja', 'ie', 'im', 'ai', 'at', 'ch', 'ly', 'market', 'click','fr', 'nl', 'se')
cleandomain = []
In [46]:
index = df2.Domain.str.split('.').tolist()
def domfunction():
    try:
        #if the last two parts of the domain are in the TLD list, then take those parts, plus the piece before them
        if len(x) > 2 and str(x[len(x)-2]).rstrip()+'.'+ str(x[len(x)-1]).rstrip() in tld:  
            try:
                cleandomain.append(str(x[len(x)-3])+'.'+str(x[len(x)-2])+'.'+str(x[len(x)-1]))
            except IndexError:
                cleandomain.append(str(x))
            except TypeError:
                cleandomain.append(str(x))
                
        #if the last part of the domain is in the TLD list, then take that part, plus the piece before it
        elif len(x) > 1 and str(x[len(x)-1]).rstrip() in tld:  
            try:
                cleandomain.append(str(x[len(x)-2])+'.'+str(x[len(x)-1]))
            except IndexError:
                cleandomain.append(str(x))
            except TypeError:
                cleandomain.append(str(x))

        #if the last three parts of the domain are in the TLD list, then take those parts, plus the piece before them
        elif len(x) > 1 and str(x[len(x)-3]).rstrip()+'.'+ str(x[len(x)-2]).rstrip()+'.'+ str(x[len(x)-1]).rstrip()  in tld:  
            try:
                cleandomain.append(str(x[len(x)-4])+'.'+str(x[len(x)-3])+'.'+str(x[len(x)-2])+'.'+str(x[len(x)-1]))
            except IndexError:
                cleandomain.append(str(x))
            except TypeError:
                cleandomain.append(str(x))
        else:
            cleandomain.append(str(x))
    except IndexError:
        cleandomain.append(str(x))
    except TypeError:
        cleandomain.append(str(x))
In [47]:
#Loop through each of the index items (index = df2.Domain.str.split('.').tolist()) and run the domain function
for x in index:
    domfunction()
In [48]:
#Convert the list to a Pandas series and add the series values as a new column in the dataframe
se = pd.Series(cleandomain)
df2['newdomain2'] = se.values
In [49]:
#as you can see below, fonts.googleapis.com has been correctly altered
df2.head()
Out[49]:
Domain newdomain2
0 fonts.googleapis.com googleapis.com
1 facebook.com facebook.com
2 youtube.com youtube.com
3 twitter.com twitter.com
4 google.com google.com
In [50]:
#More examples of correctly working script:
df2
Out[50]:
Domain newdomain2
0 fonts.googleapis.com googleapis.com
1 facebook.com facebook.com
2 youtube.com youtube.com
3 twitter.com twitter.com
4 google.com google.com
5 plus.google.com google.com
6 instagram.com instagram.com
7 linkedin.com linkedin.com
8 s.w.org w.org
9 pinterest.com pinterest.com
10 cdnjs.cloudflare.com cloudflare.com
11 en.wikipedia.org wikipedia.org
12 youtu.be youtu.be
13 itunes.apple.com apple.com
14 blogger.com blogger.com
15 goo.gl ['goo', 'gl']
16 maps.google.com google.com
17 gmpg.org gmpg.org
18 bit.ly bit.ly
19 vimeo.com vimeo.com
20 github.com github.com
21 docs.google.com google.com
22 amazon.com amazon.com
23 play.google.com google.com
24 wordpress.org wordpress.org
25 3.bp.blogspot.com blogspot.com
26 googletagmanager.com googletagmanager.com
27 flickr.com flickr.com
28 support.google.com google.com
29 wp.me wp.me
... ... ...
9999970 vhz-online.us16.list-manage2.com list-manage2.com
9999971 victoriatapperphotography.us16.list-manage2.com list-manage2.com
9999972 vidresif.us16.list-manage2.com list-manage2.com
9999973 virginialaken.us16.list-manage2.com list-manage2.com
9999974 visionawaken.us16.list-manage2.com list-manage2.com
9999975 wakenation.us16.list-manage2.com list-manage2.com
9999976 wavesurfsupply.us16.list-manage2.com list-manage2.com
9999977 webflow.us16.list-manage2.com list-manage2.com
9999978 weebly.us16.list-manage2.com list-manage2.com
9999979 wellnesswatered.us16.list-manage2.com list-manage2.com
9999980 westernprec.us16.list-manage2.com list-manage2.com
9999981 workincrypto.us16.list-manage2.com list-manage2.com
9999982 wyld.us16.list-manage2.com list-manage2.com
9999983 yeswave.us16.list-manage2.com list-manage2.com
9999984 youtube.us16.list-manage2.com list-manage2.com
9999985 z-quadrat-mainz.us16.list-manage2.com list-manage2.com
9999986 zeistermagazine.us16.list-manage2.com list-manage2.com
9999987 100layers.us2.list-manage2.com list-manage2.com
9999988 10t.us2.list-manage2.com list-manage2.com
9999989 1berkshire.us2.list-manage2.com list-manage2.com
9999990 255.us2.list-manage2.com list-manage2.com
9999991 3greentree.us2.list-manage2.com list-manage2.com
9999992 4xma.us2.list-manage2.com list-manage2.com
9999993 625paper.us2.list-manage2.com list-manage2.com
9999994 62n.us2.list-manage2.com list-manage2.com
9999995 7ideas.us2.list-manage2.com list-manage2.com
9999996 8th-wave.us2.list-manage2.com list-manage2.com
9999997 8thgradenothing.us2.list-manage2.com list-manage2.com
9999998 8thlife.us2.list-manage2.com list-manage2.com
9999999 943kat.us2.list-manage2.com list-manage2.com

10000000 rows × 2 columns

In [ ]: