Sunday, April 28, 2019

Multi-delimiter Spark

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Converts a text file whose fields are separated by the two-character
  * delimiter `!~` into Parquet output partitioned by the `input` column.
  *
  * The header is the first line that contains the text "input"; its fields
  * become the column names, and every column is typed as a string.
  */
object PaymentFile {

  /**
    * Entry point. An explicit `main` is used instead of `scala.App` because the
    * Spark documentation warns that subclasses of `scala.App` may not work
    * correctly.
    *
    * @param args command-line arguments (unused)
    * @throws IllegalArgumentException if no line of the input contains "input"
    */
  def main(args: Array[String]): Unit = {
    val inputPath = "C:\\Senthil\\SenStudy\\Scala\\Files\\multidelimiter.txt"
    val outputPath = "C:\\Senthil\\SenStudy\\Scala\\Files\\multidelimiter"
    // Regex matching the literal two-character sequence "!~".
    val delimiter = "\\!\\~"

    val spark: SparkSession = SparkSession.builder().appName("PaymentFile")
      .config("spark.master", "local")
      .getOrCreate()

    try {
      val lines = spark.sparkContext.textFile(inputPath)

      // Locate the header once; fail with a clear message when it is missing
      // instead of the generic error thrown by first() on an empty RDD.
      val headerLine = lines.filter(_.contains("input")).take(1).headOption.getOrElse(
        throw new IllegalArgumentException(
          s"No header line containing 'input' found in $inputPath"))

      val columns = headerLine.split(delimiter)
      val width = columns.length
      val schema = StructType(columns.map(name => StructField(name, StringType)).toSeq)

      val rows = lines
        // Drop blank lines and (possibly repeated) header lines only; a data
        // record that merely contains the text "input" is kept.
        .filter(line => line.nonEmpty && line != headerLine)
        // split() discards trailing empty fields, so pad each record back to
        // the header width to keep the Row aligned with the schema.
        .map(line => Row.fromSeq(line.split(delimiter).padTo(width, "").toSeq))

      spark.createDataFrame(rows, schema)
        .write.partitionBy("input").mode("overwrite").parquet(outputPath)
    } finally {
      // Release the local Spark resources even when the job fails.
      spark.stop()
    }
  }

}

No comments:

Post a Comment