SFTP API


Design Considerations

Spark SFTP connector


Create Hive Table Catalog

The following hive table points to DropZone (SFTP server)

  CREATE EXTERNAL TABLE pcatalog.sftp_drop_zone (
        payload string)
      ROW FORMAT SERDE
        'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
      STORED AS INPUTFORMAT
        'org.apache.hadoop.mapred.TextInputFormat'
      OUTPUTFORMAT
        'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
      LOCATION
        'hdfs://horton/tmp/pcatalog/sftp_drop_zone'
      TBLPROPERTIES (
        'gimel.sftp.host'='dropzone.paypalcorp.com',
        'gimel.storage.type'='SFTP')

Supported File Types


Password Options


SFTP GIMEL Read API for CSV

val options = Map("gimel.sftp.username" -> "bagopalan",
                  "gimel.sftp.file.password.strategy" -> "file"
                  "gimel.sftp.file.password.source" -> "local",
                  "gimel.sftp.file.password.path" -> "/x/home/xxxx/mypass.txt",
                  "hdfsTempLocation" -> "/tmp/basu", 
                  "header" -> "false", 
                  "gimel.sftp.filetype" -> "csv", 
                  "gimel.sftp.file.location" -> "bus_use.csv" )
val sftpDF = dataSet.read("pcatalog.SFTP.DropZone.default.Files", options )
sftpDF.show


SFTP GIMEL Write API JSON

val options = Map("gimel.sftp.username" -> "bagopalan",
                  "gimel.sftp.file.password.strategy" -> "file"
                  "gimel.sftp.file.password.source" -> "hdfs",
                  "gimel.sftp.file.password.path" -> "hdfs://horton/user/xxxxxx/mypass.txt",
                  "hdfsTempLocation" -> "/tmp/basu", 
                  "header" -> "false", 
                  "gimel.sftp.filetype" -> "json", 
                  "gimel.sftp.file.location" -> "myJsonNew.json" )
val sftpDFRes = dataSet.write("pcatalog.SFTP.DropZone.default.Files", sftpDF, options )

SFTP GIMEL GSQL

spark.sql("set gimel.sftp.username=bagopalan")
spark.sql("set gimel.sftp.password=*****")
spark.sql("set gimel.sftp.filetype=csv")
spark.sql("set gimel.sftp.file.location=bus_use.csv")
spark.sql("set header=false")
spark.sql("set gimel.jdbc.password.strategy=file")
spark.sql("set hdfsTempLocation=/tmp/basu")
spark.sql("set gimel.jdbc.p.file=hdfs://horton/user/bagopalan/pass.dat")

val newDF = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch("create table pcatalog.teradata.simba.pp_scratch.myTable1 as select * from pcatalog.SFTP.DropZone.default.Files",spark)

val sampleDF = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch("insert into pcatalog.teradata.simba.pp_scratch.myTable1 as select * from pcatalog.SFTP.DropZone.default.Files",spark)

// Here pcatalog.SFTP.DropZone.default.Files is pcatalog table created through gimel pcatalog UI
// This is created by pointing dropzone of paypal corp.


SFTP GIMEL READ WRITE FOR XML

// Read xml 
val options = Map("gimel.sftp.file.password.strategy" -> "file",
                        "gimel.sftp.file.password.source" -> "local",
                        "gimel.sftp.file.password.path" -> "/x/home/USER/mypass.txt",
                        "gimel.sftp.username" -> "username",
                        "header" -> "false",
                        "hdfsTempLocation" -> "hdfs://horton/user/username/",
                        "gimel.sftp.filetype" -> "xml",
                       "rowTag" -> "YEAR",
                         "gimel.sftp.file.location" -> "myxml.xml" );
val sftpDFRes = dataSet.read("pcatalog.SFTP.DropZone.default.Files", options )

// Write XML

val options = Map("gimel.sftp.file.password.strategy" -> "file",
                        "gimel.sftp.file.password.source" -> "local",
                        "gimel.sftp.file.password.path" -> "/x/home/USER/mypass.txt",
                        "gimel.sftp.username" -> "username",
                        "header" -> "false",
                        "hdfsTempLocation" -> "hdfs://horton/user/username/",
                        "gimel.sftp.filetype" -> "xml",
                       "rowTag" -> "YEAR",
                       "rootTag" -> "YEARS",
                         "gimel.sftp.file.location" -> "myxml.xml" );
dataSet.write("pcatalog.SFTP.DropZone.default.Files", sftpDFRes, options )



Limitations