Create spark session

In [15]:
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Create my_spark_session
# if it already exists, it'll get, otherwise it'll create
my_spark = SparkSession.builder.getOrCreate()

# print session
print(my_spark)

#To view the dataframe, we can do df.show()
<pyspark.sql.session.SparkSession object at 0x1085e0710>

Run SQL on a single file

In [18]:
# create dataframe from the csv
df1 = my_spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('crimes.csv')
df1.createOrReplaceTempView("crimes")

# run SQL against the dataframe
finaldf = spark.sql(" select * from crimes where OffenceGroup = 'Theft offences'")

Joining files

In [34]:
# create 2 dataframes from two files
df1 = my_spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('crimes.csv')
df1.createOrReplaceTempView("crimes")

df2 = my_spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('jointbl.csv')
df2.createOrReplaceTempView("jointbl")

# run SQL against the dataframe, using the temptable aliases as table names
finaldf = spark.sql(''' select 'Financial Year', crimes.ForceName, jointbl.populaton
                        from crimes 
                        left join jointbl on crimes.ForceName = jointbl.ForceName 
                        where OffenceGroup = 'Theft offences' ''')
In [35]:
finaldf.show()
+--------------+--------------+---------+
|Financial Year|     ForceName|populaton|
+--------------+--------------+---------+
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
|Financial Year|West Yorkshire|  3000000|
+--------------+--------------+---------+
only showing top 20 rows

In [ ]: