In [5]:
#pip install geopy
import geopy.distance
from pyhive import hive
import pandas as pd

host_name = "ip address"
port = port number
user = "username"
password = "password"
database="dbname"

def hiveconnection(host_name, port, user,password, database):
    conn = hive.Connection(host=host_name, port=port, username=user, password=password,
                           database=database, auth='CUSTOM')
    cur = conn.cursor()
    cur.execute('select query .....')
    result = cur.fetchall()

    return result

# Call above function
output = hiveconnection(host_name, port, user,password, database)

Find cell entry and exit time

In [187]:
df = pd.DataFrame(output, columns =['userid', 'entry_time', 'cell-site', 'latitude', 'longitude']).sort_values(by=['userid', 'entry_time']).reset_index(drop=True)
df_sorted = df.assign(exit_time=df.groupby('userid').entry_time.shift(periods=-1))

Calculate time in cell & home cell

In [188]:
from datetime import datetime
df_sorted['time_in_cell'] = pd.to_datetime(df_sorted['exit_time'])-pd.to_datetime(df_sorted['entry_time'])
In [181]:
dfx = df_sorted.groupby(['userid','cell-site'])[['time_in_cell']].agg('sum').reset_index()
dfx['Cell_Rank'] =dfx.groupby(['userid'])['time_in_cell'].rank(ascending=False)
top_cells = dfx.where(dfx['Cell_Rank'] ==  1)
top_cells.dropna(thresh=2)
Out[181]:
userid cell-site time_in_cell Cell_Rank
48 customer1 cell1 5.22 1.0
147 customer2 cell2 12:39:13.918000 1.0
197 customer3 cell3 13:57:21.866000 1.0
In [189]:
import math
import numpy as np
df_geo = df.assign(next_lat=df.groupby('userid').latitude.shift(periods=-1))
df_geo = df_geo.assign(next_long=df.groupby('userid').longitude.shift(periods=-1))

for index, row in df_geo.iterrows():   
    start = (row['latitude'], row['longitude'])
    end = (row['next_lat'], row['next_long'])
    
    try:
        df_geo.loc[index+1,'distance'] = geopy.distance.geodesic(start, end).km 
    except:
        df_geo.loc[index+1, 'distance'] = np.nan
In [190]:
df_geox = df_geo.groupby(['userid'])[['distance']].agg('sum').reset_index()
df_geox
Out[190]:
userid distance
0 customer1 487.482911
1 customer2 759.938315
2 customer3 136.951007
In [ ]: