Create Spark Session

In [ ]:
#!/usr/bin/env python
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Create my_spark session;
# getOrCreate() returns the existing session if one is already running, otherwise it creates a new one
my_spark = SparkSession.builder.appName('myapp').getOrCreate()

# print session
print(my_spark)
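
If the job needs non-default settings, the builder also accepts configuration before getOrCreate(). A minimal sketch, with illustrative values (the executor memory and shuffle-partition numbers below are placeholders, not settings taken from this job):

In [ ]:
# Optional: configure the session at build time (values are illustrative only)
my_spark = SparkSession.builder \
    .appName('myapp') \
    .config('spark.executor.memory', '4g') \
    .config('spark.sql.shuffle.partitions', '200') \
    .getOrCreate()

# Report the Spark version the session is running on
print(my_spark.version)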

Define schema

In [ ]:
schema = [
    StructField('field1', StringType(), True),
    StructField('field2', StringType(), True),
    StructField('field3', StringType(), True),
    StructField('field4', StringType(), True),
    StructField('field5', StringType(), True),
    StructField('field6', StringType(), True)
]

final_structure = StructType(schema)
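
IntegerType and FloatType are imported above but not used in this schema, because every field is read as a string. If some columns are known to be numeric, they can be declared directly so the reader parses them on load. A small sketch with hypothetical field names:

In [ ]:
# Illustrative schema mixing string and numeric fields (field names are hypothetical)
typed_schema = StructType([
    StructField('id', StringType(), True),
    StructField('count', IntegerType(), True),
    StructField('score', FloatType(), True)
])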

Read the data into a DataFrame with the schema applied

In [ ]:
# Yesterday's date in YYYYMMDD form, used to pick the HDFS date partition
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y%m%d')

df1 = my_spark.read \
    .option("header", "false") \
    .option("delimiter", "\t") \
    .schema(final_structure) \
    .csv('hdfs://path_to_file/dt=' + yesterday + '/*/*/*')
df1.printSchema()
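
A quick sanity check after loading helps confirm that the delimiter and schema line up with the data; for example:

In [ ]:
# Peek at a few rows and count the records as a basic sanity check
df1.show(5, truncate=False)
print(df1.count())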

Write the DataFrame to HDFS

In [ ]:
# Note: Spark writes a directory named "analysis.csv" containing part files, not a single CSV file
df1.write.csv("hdfs://path_to_file/spark_exports/feed/" + yesterday + "/analysis.csv")
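
By default the write above fails if the output path already exists and produces one part file per DataFrame partition with no header row. A sketch of common adjustments (coalesce(1) pulls everything through a single task, so it is only suitable for small outputs):

In [ ]:
# Illustrative variant: overwrite an existing path, write a header row,
# and coalesce to a single part file (small outputs only)
df1.coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("hdfs://path_to_file/spark_exports/feed/" + yesterday + "/analysis.csv")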