
baby-bell/bbb_rfid_to_keyboard 0

RFID reader to keyboard interface

baby-bell/pie-classes 0

Class listings for PiE staff members

baby-bell/rfid_keyboard 0

Keyboard emulation for the BeagleBone Black

baby-bell/walkstop-phone 0

Audio tours using Twilio

pioneers/hibike_packet 0

C extension for hibike

pull request comment Guidewire/cda-client

Ps dm branch 1

I've added comments in response to the requested changes. A few of the suggestions had issues and I was unable to get them working, so I reverted to the original code; in each of those cases I left a comment explaining why.

We don't have time to take on the remaining items that we haven't commented on. Can the PR be accepted without those changes?

Also, we corrected some configuration validation issues around the Merged and Raw settings.

I will defer to @gw-vsasidharan here.

cwilliams-gw

comment created a month ago

Pull request review comment Guidewire/cda-client

Ps dm branch 1

Review context: diff to the OutputWriter trait (OutputWriter.scala) adding writeParquet, writeJdbcRaw, createIndexes, writeJdbcMerged, getTableCreateDDL, and buildDDLColumnDefinition. The comment below is attached to the data-type selection at the end of buildDDLColumnDefinition:

    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are
    // potentially too long or short. nvarchar(max) columns can't be indexed, and Oracle JDBC converts
    // the string datatype to VARCHAR2(255), which is potentially too short.
    val stringDataType = dbProductName match {
      case "Microsoft SQL Server" => "VARCHAR(1333)"
      case "PostgreSQL"           => "VARCHAR(1333)"
      case "Oracle"               => "VARCHAR2(1333)"
      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")
    }
    // For string data we also need to handle known very large text columns to avoid truncation SQL exceptions.
    val largeStringDataType = dbProductName match {
      case "Microsoft SQL Server" => "VARCHAR(max)"
      case "PostgreSQL"           => "VARCHAR"
      case "Oracle"               => "VARCHAR2(32767)" // requires the Oracle MAX_STRING_SIZE parameter to be set to EXTENDED.
      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")
    }
    // BLOB data is also handled differently per platform.
    val blobDataType = dbProductName match {
      case "Microsoft SQL Server" => "VARBINARY(max)"
      case "PostgreSQL"           => "bytea"
      case "Oracle"               => "BLOB"
      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")
    }
    val fieldDataTypeDefinition = if (fieldDataType == StringType)

No, as in

if (fieldDataType == StringType) {
  ...
} else if (fieldDataType == BinaryType) {
  ...
} else {
  ...
}
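
Roughly, with the branch bodies taken from the values already defined in this method, the braced form might look like the sketch below (isKnownLargeTextColumn is a hypothetical stand-in for the existing tableName/fieldName checks, not something in the diff):

    val fieldDataTypeDefinition = if (fieldDataType == StringType) {
      // Known very large OOTB text columns get the large string type to avoid truncation.
      // isKnownLargeTextColumn is a hypothetical helper standing in for the existing checks.
      if (isKnownLargeTextColumn(tableName, fieldName)) largeStringDataType else stringDataType
    } else if (fieldDataType == BinaryType) {
      blobDataType
    } else {
      getJdbcType(fieldDataType, dialect).databaseTypeDefinition
    }
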
cwilliams-gw

comment created a month ago

PullRequestReviewEvent

Pull request review comment Guidewire/cda-client

Ps dm branch 1

Review context: same OutputWriter.scala diff as above; the comment below is attached to the UPDATE statement built in writeJdbcMerged:

      // Build the SQL UPDATE statement to be used as a prepared statement for the updates.
      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")
      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")
      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"

OK, if it's not working, just revert it to the previous form.

cwilliams-gw

comment created a month ago

PullRequestReviewEvent

Pull request review comment Guidewire/cda-client

Ps dm branch 1

Review context: same OutputWriter.scala diff, attached to the same updateStatement concatenation in writeJdbcMerged shown above.

Ah right, it doesn't like that. You can use triple quotes:

val updateStatement = s"""UPDATE $tableName SET $colNamesForSetClause WHERE "id" = ?"""
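
The same triple-quoted form would also drop the escaping from the delete statement later in this method, e.g. (a sketch along the same lines, not a change made in this PR):

val deleteStatement = s"""DELETE FROM $tableName WHERE "id" = ?"""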
cwilliams-gw

comment created a month ago

PullRequestReviewEvent

Pull request review comment Guidewire/cda-client

Ps dm branch 1

Review context: same OutputWriter.scala diff as above; this excerpt continues through getTableCreateDDL, buildDDLColumnDefinition, and into updateDataframe (the prepared-statement batch writer, including the Oracle BooleanType null-type workaround) before the captured excerpt cuts off.
setters(i).apply(stmt, row, i)+            }+            i = i + 1+          }+          stmt.addBatch()+          rowCount += 1+          totalRowCount += 1+          if (rowCount % batchSize == 0) {+            stmt.executeBatch()+            log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+            rowCount = 0+          }+        }++        if (rowCount > 0) {+          stmt.executeBatch()+          log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+        }+      } finally {+        stmt.close()+      }+      completed = true+    } catch {+      case e: SQLException =>+        //log.info(s"Catch exception for $table - $updateStmt")

Remove commented code.

cwilliams-gw

comment created time in a month
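
The batching loop shown in the excerpt above is the standard JDBC addBatch/executeBatch pattern. A minimal standalone sketch of that pattern, with a made-up table, columns, and row type (none of these names come from the PR):

import java.sql.Connection

// Illustrative only: batch-insert (id, name) pairs into a hypothetical example_table,
// flushing every batchSize rows and once more at the end, as updateDataframe does above.
def batchInsertExample(conn: Connection, rows: Seq[(Int, String)], batchSize: Int = 5000): Unit = {
  val stmt = conn.prepareStatement("INSERT INTO example_table (id, name) VALUES (?, ?)")
  try {
    var pending = 0
    rows.foreach { case (id, name) =>
      stmt.setInt(1, id)
      stmt.setString(2, name)
      stmt.addBatch()
      pending += 1
      if (pending % batchSize == 0) { // flush a full batch
        stmt.executeBatch()
        pending = 0
      }
    }
    if (pending > 0) stmt.executeBatch() // flush the remainder
  } finally {
    stmt.close()
  }
}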

Pull request review comment Guidewire/cda-client

Ps dm branch 1

  def schemasAreConsistent(fileDataFrame: DataFrame, jdbcSchemaName: String, tableName: String, schemaFingerprint: String, url: String,
                           user: String, pswd: String, spark: SparkSession, jdbcWriteType: JdbcWriteType.Value): Boolean = {

    if (tableExists(tableName, url, user, pswd)) {
      // build a query that returns no data from the table.  This will still get us the schema definition which is all we need.
      val sql = "(select * from " + jdbcSchemaName + "." + tableName + " where 1=2) as " + tableName
      val tableDataFrame = spark.read.format("jdbc")
        .option("url", url)
        .option("dbtable", sql)
        .option("user", user)
        .option("password", pswd)
        .load()

      val dialect = JdbcDialects.get(url)

      //Derive the product name from the url to avoid having to create or pass in a connection
      // to access the metadata object.
      val dbProductName = if (url.toLowerCase.contains("sqlserver")) {
        "Microsoft SQL Server"
      }
      else {
        if (url.toLowerCase.contains("postgresql")) {
          "PostgreSQL"
        } else {
          if (url.toLowerCase.contains("oracle")) {
            "Oracle"
          }
        }
      }

I'm not sure what you want to do in the case that the URL contains none of those strings, or if that's even possible.

cwilliams-gw

comment created time in a month
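
If a URL that matches none of those platforms should be treated as unsupported, one option is to fail fast the same way the rest of the class does for unknown products. A sketch only, with a hypothetical helper name; not code from the PR:

import java.sql.SQLException

// Sketch: derive the product name from a JDBC URL, rejecting anything unrecognized,
// mirroring the "Unsupported database platform" SQLException thrown elsewhere in this class.
def dbProductNameFromUrl(url: String): String = url.toLowerCase match {
  case u if u.contains("sqlserver")  => "Microsoft SQL Server"
  case u if u.contains("postgresql") => "PostgreSQL"
  case u if u.contains("oracle")     => "Oracle"
  case _                             => throw new SQLException(s"Unsupported database platform: $url")
}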

Pull request review comment Guidewire/cda-client

Ps dm branch 1

    // Filter for records to be deleted.
    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.
    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))
      .selectExpr("id")
      .cache()

    // Log number of records to be deleted.
    val delCnt = DeleteDF.count()
    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")

    // Generate and apply delete statements.
    if (delCnt > 0) {
      val deleteSchema = DeleteDF.schema
      // Build the sql Delete statement to be used as a prepared statement for the Updates.
      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"
      val deleteStatement = s"DELETE FROM $tableName WHERE \"id\" = ?"
cwilliams-gw

comment created time in a month
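
The suggestion above replaces string concatenation with interpolation for the delete statement. If that style is adopted, the update statement built earlier in writeJdbcMerged could follow suit; the line below is illustrative only (tableName and colNamesForSetClause come from the surrounding code, not from this comment):

      // Illustrative only: the equivalent interpolated form of the UPDATE statement builder.
      val updateStatement = s"UPDATE $tableName SET $colNamesForSetClause WHERE \"id\" = ?"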

Pull request review comment Guidewire/cda-client

Ps dm branch 1

// Remove the id and sequence from the list of columns so we can handle them separately.+      // We will be grouping by the id and the sequence will be set as the first item in the list of columns.+      colNamesArray --= Array("id", "gwcbi___seqval_hex")+      val colNamesString = "gwcbi___seqval_hex, " + colNamesArray.mkString(",")++      val latestChangeForEachID = if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Find the latest change for each id based on the gwcbi___seqval_hex.+        // Note: For nested structs, max on struct is computed as+        // max on first struct field, if equal fall back to second fields, and so on.+        // In this case the first struct field is gwcbi___seqval_hex which will be always+        // be unique for each instance of an id in the group.+        UpdateDF+          .selectExpr(Seq("id", s"struct($colNamesString) as otherCols"): _*)+          .groupBy("id").agg(sqlfun.max("otherCols").as("latest"))+          .selectExpr("latest.*", "id")+          .drop(dropList: _*)+          .cache()+      } else {+        // Retain all updates.  Sort so they are applied in the correct order.+        val colVar = colNamesString + ",id"+        UpdateDF+          .selectExpr(colVar.split(","): _*)+          .sort(col("gwcbi___seqval_hex").asc)+          .drop(dropList: _*)+          .cache()+      }+      UpdateDF.unpersist()++      val latestUpdCnt = latestChangeForEachID.count()+      if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Log row count following the reduction to only last update for each id.+        log.info(s"Merged - $tableName update cnt after agg to get latest for each id: ${latestUpdCnt.toString}")+      } else {+        log.info(s"Merged - $tableName all updates will be applied in sequence.")+      }++      // Build the sql Update statement to be used as a prepared statement for the Updates.+      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")+      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")+      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"+      log.info(s"Merged - $updateStatement")++      // Get schema info required for updatePartition call.+      val updateSchema = latestChangeForEachID.schema++      // Prepare and execute one update statement per row in our update dataframe.+      updateDataframe(connection, tableName, latestChangeForEachID, updateSchema, updateStatement, batchSize, dialect, JdbcWriteType.Merged)++      latestChangeForEachID.unpersist()+    }++    // Filter for records to be deleted.+    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.+    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))+      .selectExpr("id")+      .cache()++    // Log number of records to be deleted.+    val delCnt = DeleteDF.count()+    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")++    // Generate and apply delete statements.+    if (delCnt > 0) {+      val deleteSchema = DeleteDF.schema+      // Build the sql Delete statement to be used as a prepared statement for the Updates.+      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"+      log.info(s"Merged - $deleteStatement")++      // Prepare and execute one delete statement per row in our delete dataframe.+      updateDataframe(connection, tableName, DeleteDF, 
deleteSchema, deleteStatement, batchSize, dialect, JdbcWriteType.Merged)++      tableDataFrameWrapperForMicroBatch.dataFrame.unpersist()+      DeleteDF.unpersist()+    }+    log.info(s"+++ Finished merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")+  }++  /** Build and return a table create DDL statement based on the given schema definition.+   *+   * @param dialect+   * @param schema+   * @param tableName+   * @param jdbcWriteType+   * @param dbProductName+   * @return Table create DDL statement for the given table.+   */+  def getTableCreateDDL(dialect: JdbcDialect, schema: StructType, tableName: String, jdbcWriteType: JdbcWriteType.Value, dbProductName: String): String = {+    val allTableColumnsDefinitions = new StringBuilder()++    //TODO Should this be set up as defined list(s)?+    // Define specific columns we want to set as NOT NULL. Everything coming out of CDA parquet files is defined as nullable so+    // we do this to ensure there are columns available to set as PKs and/or AKs.+    var notNullCols = List("id", "gwcbi___operation", "gwcbi___seqval_hex")+    if (jdbcWriteType == JdbcWriteType.Merged) {+      // For merged data, include publicid, retired, and typecode in list of not null columns+      // so they can be included in unique index definitions.+      notNullCols = notNullCols ++ List("publicid", "retired", "typecode")+    }+    // Build the list of columns in alphabetic order.+    schema.fields.sortBy(f => f.name).foreach { field =>+      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true+      val name = dialect.quoteIdentifier(field.name)+      val columnDefinition = buildDDLColumnDefinition(dialect, dbProductName, tableName, name, field.dataType, nullable)+      allTableColumnsDefinitions.append(s"$columnDefinition, ")+    }+    // Remove the trailing comma.+    val colsForCreateDDL = allTableColumnsDefinitions.stripSuffix(", ")+    // Build and return the final create table statement.+    s"CREATE TABLE $tableName ($colsForCreateDDL)"+  }++  /** Build and return a column definition to be used in CREATE and ALTER DDL statements.+   *+   * @param dialect+   * @param dbProductName+   * @param tableName+   * @param fieldName+   * @param fieldDataType+   * @param fieldNullable+   * @return Column definition - COLUMN_NAME TYPE_DECLARATION NULLABLE (i.e. '"ColumnName" VARCHAR(1333) NOT NULL').+   */+  def buildDDLColumnDefinition(dialect: JdbcDialect, dbProductName: String, tableName: String, fieldName: String, fieldDataType: DataType, fieldNullable: Boolean): String = {+    val columnDefinition = new StringBuilder()++    // TODO Consider making gwcbi___seqval_hex a smaller varchar than (1333) since it is part of a clustered index+    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.+    // nvarchar(max) columns can't be indexed.  
Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.+    val stringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(1333)"+      case "PostgreSQL"           => "VARCHAR(1333)"+      case "Oracle"               => "VARCHAR2(1333)"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for string data we need to handle very large text columns that we know of to avoid truncation sql exceptions.+    val largeStringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(max)"+      case "PostgreSQL"           => "VARCHAR"+      case "Oracle"               => "VARCHAR2(32767)" // requires MAX_STRING_SIZE Oracle parameter to be set to EXTENDED.+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for BLOB data we need to handle differently for different platforms.+    val blobDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARBINARY(max)"+      case "PostgreSQL"           => "bytea"+      case "Oracle"               => "BLOB"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val fieldDataTypeDefinition = if (fieldDataType == StringType)+    // TODO Consider making the determination for the need for very large text columns configurable.+    // These are the OOTB columns we have found so far.+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))+        || (tableName.equals("cc_note") && fieldName.equals("body"))+      ) largeStringDataType+      else stringDataType+    else if (fieldDataType == BinaryType) blobDataType+    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition+    val nullable = if (!fieldNullable) "NOT NULL" else ""+    columnDefinition.append(s"$fieldName $fieldDataTypeDefinition $nullable")+    columnDefinition.toString()+  }++  private def updateDataframe(conn: Connection,+                              table: String,+                              df: DataFrame,+                              rddSchema: StructType,+                              updateStmt: String,+                              batchSize: Int,+                              dialect: JdbcDialect,+                              jdbcWriteType: JdbcWriteType.Value+                             ): Unit = {+    var completed = false+    var totalRowCount = 0L+    val dbProductName = conn.getMetaData.getDatabaseProductName+    try {+      val stmt = conn.prepareStatement(updateStmt)+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))+      //For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues+      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)+      val numFields = rddSchema.fields.length++      try {+        var rowCount = 0++        df.collect().foreach { row =>+          var i = 0+          while (i < numFields) {+            if (row.isNullAt(i)) {+              stmt.setNull(i + 1, nullTypes(i))+            } else {+              
setters(i).apply(stmt, row, i)+            }+            i = i + 1+          }+          stmt.addBatch()+          rowCount += 1+          totalRowCount += 1+          if (rowCount % batchSize == 0) {+            stmt.executeBatch()+            log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+            rowCount = 0+          }+        }++        if (rowCount > 0) {+          stmt.executeBatch()+          log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+        }+      } finally {+        stmt.close()+      }+      completed = true+    } catch {+      case e: SQLException =>+        //log.info(s"Catch exception for $table - $updateStmt")+        val cause = e.getCause+        val nextcause = e.getNextException+        if (nextcause != null && cause != nextcause) {+          // If there is no cause already, set 'next exception' as cause. If cause is null,+          // it *may* be because no cause was set yet+          if (cause == null) {+            try {+              e.initCause(nextcause)+            } catch {+              // Or it may be null because the cause *was* explicitly initialized, to *null*,+              // in which case this fails. There is no other way to detect it.+              // addSuppressed in this case as well.+              case _: IllegalStateException => e.addSuppressed(nextcause)+            }+          } else {+            e.addSuppressed(nextcause)+          }+        }+        throw e+    } finally {+      if (!completed) {+        // The stage must fail.  We got here through an exception path, so+        // let the exception through and tell the user about another problem.+        log.info(s"$jdbcWriteType - Update failed for $table - $updateStmt")+      } else {+        log.info(s"$jdbcWriteType - Total rows updated for $table: $totalRowCount rows - $updateStmt")+      }+    }+  }++  private def getJdbcType(dt: DataType, dialect: JdbcDialect): JdbcType = {+    dialect.getJDBCType(dt).orElse(getCommonJDBCType(dt)).getOrElse(+      throw new IllegalArgumentException(s"Can't get JDBC type for $dt.catalogString"))+  }++  /**+   * Retrieve standard jdbc types.+   *+   * @param dt The datatype (e.g. 
[[org.apache.spark.sql.types.StringType]])+   * @return The default JdbcType for this DataType+   */+  private def getCommonJDBCType(dt: DataType): Option[JdbcType] = {++    dt match {+      case IntegerType    => Option(JdbcType("INTEGER", java.sql.Types.INTEGER))+      case LongType       => Option(JdbcType("BIGINT", java.sql.Types.BIGINT))+      case DoubleType     => Option(JdbcType("DOUBLE PRECISION", java.sql.Types.DOUBLE))+      case FloatType      => Option(JdbcType("REAL", java.sql.Types.FLOAT))+      case ShortType      => Option(JdbcType("INTEGER", java.sql.Types.SMALLINT))+      case ByteType       => Option(JdbcType("BYTE", java.sql.Types.TINYINT))+      case BooleanType    => Option(JdbcType("BIT(1)", java.sql.Types.BIT))+      case StringType     => Option(JdbcType("TEXT", java.sql.Types.CLOB))+      case BinaryType     => Option(JdbcType("BLOB", java.sql.Types.BLOB))+      case TimestampType  => Option(JdbcType("TIMESTAMP", java.sql.Types.TIMESTAMP))+      case DateType       => Option(JdbcType("DATE", java.sql.Types.DATE))+      case t: DecimalType => Option(+        JdbcType(s"DECIMAL(${t.precision},${t.scale})", java.sql.Types.DECIMAL))+      case _              => None+    }+  }++  /**+   * A `JDBCValueSetter` is responsible for setting a value from `Row` into a field for+   * `PreparedStatement`. The last argument `Int` means the index for the value to be set+   * in the SQL statement and also used for the value in `Row`.+   * private type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit+   */+  private type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit++  private def makeSetter(+                          conn: Connection,+                          dialect: JdbcDialect,+                          dataType: DataType): JDBCValueSetter = dataType match {+    case IntegerType      =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getInt(pos))+    case LongType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setLong(pos + 1, row.getLong(pos))+    case DoubleType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setDouble(pos + 1, row.getDouble(pos))+    case FloatType        =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setFloat(pos + 1, row.getFloat(pos))+    case ShortType        =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getShort(pos))+    case ByteType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getByte(pos))+    case BooleanType      =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBoolean(pos + 1, row.getBoolean(pos))+    case StringType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setString(pos + 1, row.getString(pos))+    case BinaryType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBytes(pos + 1, row.getAs[Array[Byte]](pos))+    case TimestampType    =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setTimestamp(pos + 1, row.getAs[java.sql.Timestamp](pos))+    case DateType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setDate(pos + 1, row.getAs[java.sql.Date](pos))+    case t: DecimalType   =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBigDecimal(pos + 1, row.getDecimal(pos))+    case ArrayType(et, _) =>+      // remove type length parameters from end of 
type name+      val typeName = getJdbcType(et, dialect).databaseTypeDefinition+        .toLowerCase(Locale.ROOT).split("\\(")(0)+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        val array = conn.createArrayOf(+          typeName,+          row.getSeq[AnyRef](pos).toArray)+        stmt.setArray(pos + 1, array)+    case _                =>+      (_: PreparedStatement, _: Row, pos: Int) =>+        throw new IllegalArgumentException(+          s"Can't translate non-null value for field $pos")+  }++  /** Determine if table schema definition is the same as the parquet file schema definition.+   * If differences in columns, ADD or DROP necessary columns from database table to align definitions, and re-check+   * for a match between database and file schema definitions.+   *+   * @param fileDataFrame  based on the parquet file format+   * @param jdbcSchemaName database schema name+   * @param tableName      is the name of the database table we being compared to+   * @param url            database url+   * @param user           database user name+   * @param pswd           database password+   * @parm spark is the spark session.+   * @param jdbcWriteType Merge vs Raw to determine exclusion of internal 'gwcbi__' columns+   *                      when comparing schemas.  When merging data we remove those columns+   *                      from the data set before saving the data so we don't want to check+   *                      for them when comparing to the schema definition in the database.+   * @return Boolean indicating if the table schema definition is the same as the parquet file schema definition+   */+  //TODO Consider renaming this function (schemasAreConsistent) to indicate we're doing more than just checking for consistency+  def schemasAreConsistent(fileDataFrame: DataFrame, jdbcSchemaName: String, tableName: String, schemaFingerprint: String, url: String,+                           user: String, pswd: String, spark: SparkSession, jdbcWriteType: JdbcWriteType.Value): Boolean = {++    if (tableExists(tableName, url, user, pswd)) {+      // build a query that returns no data from the table.  This will still get us the schema definition which is all we need.+      val sql = "(select * from " + jdbcSchemaName + "." + tableName + " where 1=2) as " + tableName+      //      val sql = jdbcSchemaName + "." + tableName+      val tableDataFrame = spark.read.format("jdbc")+        .option("url", url)+        .option("dbtable", sql)+        .option("user", user)+        .option("password", pswd)+        .load()++      val dialect = JdbcDialects.get(url)++      //Derive the product name from the url to avoid having to create or pass in a connection+      // to access the metadata object.+      val dbProductName = if (url.toLowerCase.contains("sqlserver")) {+        "Microsoft SQL Server"+      }+      else {+        if (url.toLowerCase.contains("postgresql")) {+          "PostgreSQL"+        } else {+          if (url.toLowerCase.contains("oracle")) {+            "Oracle"+          }+        }+      }
      val dbProductName = if (url.toLowerCase.contains("sqlserver")) {
        "Microsoft SQL Server"
      } else if (url.toLowerCase.contains("postgresql")) {
        "PostgreSQL"
      } else if (url.toLowerCase.contains("oracle")) {
        "Oracle"
      } else {
        // Mirror the unsupported-platform handling used elsewhere in this writer,
        // and keep dbProductName typed as String rather than Any.
        throw new SQLException(s"Unsupported database platform: $url")
      }
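
createIndexes and schemasAreConsistent both sniff the platform from the url string, so one further option (a sketch only; dbProductNameFromUrl is a hypothetical helper, not part of this PR) is to centralize that mapping and handle the unsupported-platform case in one place:

    import java.sql.SQLException

    // Hypothetical helper: maps a JDBC url to the product name used by the
    // existing dbProductName match blocks, failing fast on anything else.
    private def dbProductNameFromUrl(url: String): String = {
      val lowerUrl = url.toLowerCase
      if (lowerUrl.contains("sqlserver")) "Microsoft SQL Server"
      else if (lowerUrl.contains("postgresql")) "PostgreSQL"
      else if (lowerUrl.contains("oracle")) "Oracle"
      else throw new SQLException(s"Unsupported database platform: $url")
    }

Callers such as schemasAreConsistent could then pass the result straight into the existing match on dbProductName.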
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+    val fieldDataTypeDefinition = if (fieldDataType == StringType)
+    // TODO Consider making the determination for the need for very large text columns configurable.
+    // These are the OOTB columns we have found so far.
+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))
+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))
+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))
+        || (tableName.equals("cc_note") && fieldName.equals("body"))
+      ) largeStringDataType
+      else stringDataType
      (tableName, fieldName) match {
        case ("cc_outboundrecord", "content") |
        ("cc_contactorigvalue", "origvalue") |
        ("pc_...", ...) |
        ... => largeStringDataType
        case _ => stringDataType
      }
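
Written out in full with the four large-text columns the original condition already names (a sketch for reference, not the reviewer's verbatim suggestion), the match would read:

      (tableName, fieldName) match {
        case ("cc_outboundrecord", "content") |
             ("cc_contactorigvalue", "origvalue") |
             ("pc_diagratingworksheet", "diagnosticcapture") |
             ("cc_note", "body") => largeStringDataType
        case _                   => stringDataType
      }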
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+    val fieldDataTypeDefinition = if (fieldDataType == StringType)

Add some braces around this if.
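
For illustration, a fully braced form of the expression (same behavior as the original, just restructured) might look like:

      val fieldDataTypeDefinition = if (fieldDataType == StringType) {
        // TODO Consider making the determination for the need for very large text columns configurable.
        // These are the OOTB columns we have found so far.
        if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))
          || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))
          || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))
          || (tableName.equals("cc_note") && fieldName.equals("body"))
        ) {
          largeStringDataType
        } else {
          stringDataType
        }
      } else if (fieldDataType == BinaryType) {
        blobDataType
      } else {
        getJdbcType(fieldDataType, dialect).databaseTypeDefinition
      }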

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {
   // ... diff context abridged (writeParquet, writeJdbcRaw, createIndexes, writeJdbcMerged) ...
   // The lines under review, from writeJdbcMerged:

      // Build the sql Update statement to be used as a prepared statement for the Updates.
      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")
      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")
      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"
      val updateStatement = s"UPDATE $tableName SET $colNamesForSetClause WHERE \"id\" = ?"
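The interpolated form also matches how the insert statement is already built earlier in writeJdbcMerged (quoted from the diff context above), so the two prepared statements read consistently:

    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")
    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")
    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"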
cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

 trait OutputWriter {
   // ... diff context abridged (writeParquet, writeJdbcRaw, createIndexes, writeJdbcMerged, getTableCreateDDL, buildDDLColumnDefinition, updateDataframe, getCommonJDBCType, makeSetter, schemasAreConsistent) ...
   // The lines under review, from tableExists:

  def tableExists(tableName: String, url: String, user: String, pswd: String): Boolean = {
    val connection = DriverManager.getConnection(url, user, pswd)
    val dbm = connection.getMetaData
    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableName, Array("TABLE"))
    if (tables.next) {
      connection.close()
      true
    } else {
      connection.close()
      false
    }
    try {
      tables.next()
    } finally {
      connection.close()
    }
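Assembled, the helper with that cleanup applied would read roughly as below. This is a sketch of the suggestion folded into the method shown in the diff (it relies on the java.sql.DriverManager import the file already uses), not the code as merged:

  def tableExists(tableName: String, url: String, user: String, pswd: String): Boolean = {
    // Probe the catalog for the table using a short-lived connection.
    val connection = DriverManager.getConnection(url, user, pswd)
    val dbm = connection.getMetaData
    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableName, Array("TABLE"))
    try {
      // next() is true exactly when the metadata query matched a table.
      tables.next()
    } finally {
      // The connection is closed whether next() returns normally or throws.
      connection.close()
    }
  }

The try/finally keeps a single exit point and removes the duplicated connection.close() calls from the two branches.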
cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

 trait OutputWriter {
   // ... diff context abridged (writeParquet, writeJdbcRaw, createIndexes, writeJdbcMerged) ...
   // The lines under review, from writeJdbcMerged:

    // Filter for records to update.
    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))
      .cache()

    // Log total rows marked as updates.
    val UpdCnt = UpdateDF.count()
    val updateCount = UpdateDF.count()

Why the abbreviation?
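If the rename lands, the later uses in the diff (the log call and the `if (UpdCnt > 0)` guard) would follow it; for example, keeping the original log text and dropping the redundant .toString, since interpolation already calls it:

    val updateCount = UpdateDF.count()
    log.info(s"Merged - $tableName update cnt after filter: $updateCount")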

cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

 trait OutputWriter {
   // ... diff context abridged (writeParquet, writeJdbcRaw) ...
   // The line under review, from createIndexes:

      // Create primary key.
      var ddlPK = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_pk PRIMARY KEY "
      var ddlPK = s"ALTER TABLE $tableName ADD CONSTRAINT ${tableNameNoSchema}_pk PRIMARY KEY "
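A note on the suggested interpolation: when an interpolated identifier is immediately followed by word characters, Scala needs braces to delimit it; without them the compiler either looks up the wrong name or a stray space ends up in the generated SQL. A minimal sketch with placeholder table names (not taken from the PR):

```scala
object DdlInterpolationSketch extends App {
  // Placeholder values, purely to show the interpolation behaviour.
  val tableName = "dbo.cc_note"
  val tableNameNoSchema = "cc_note"

  // s"...$tableNameNoSchema_pk..."  -> does not compile: Scala looks for a value named `tableNameNoSchema_pk`.
  // s"...$tableNameNoSchema _pk..." -> compiles, but the space leaks into the SQL ("cc_note _pk").
  // Braces delimit the identifier so the "_pk" suffix can follow immediately:
  val ddlPK = s"ALTER TABLE $tableName ADD CONSTRAINT ${tableNameNoSchema}_pk PRIMARY KEY "
  println(ddlPK) // ALTER TABLE dbo.cc_note ADD CONSTRAINT cc_note_pk PRIMARY KEY
}
```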
cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

[Diff context omitted — duplicate of the OutputWriter diff hunk quoted in full in a later review comment below. The commented line, in writeJdbcRaw:]

    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame

Looks like a variable, so make it camelCase

cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

 trait OutputWriter {     }   } +  /** Write a table to a Parquet.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  def writeParquet(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {+    val pathToFolderWithParquet = this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)+    if (saveAsSingleFile) {+      tableDataFrameWrapperForMicroBatch.dataFrame+        .coalesce(1)+        .write.mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    } else {+      tableDataFrameWrapperForMicroBatch.dataFrame.write+        .mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    }+  }++  /** Write RAW data to a JDBC target database.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcRaw(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    val tableName = clientConfig.jdbcConnectionRaw.jdbcSchema + "." + tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp++    log.info(s"*** Writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")++    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame+    InsertDF.cache()++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionRaw.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName++    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next()++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Raw, dbProductName)+      // Execute the table create DDL+      val stmt = connection.createStatement+      log.info(s"Raw - $createTableDDL")+      stmt.execute(createTableDDL)+      stmt.close()+      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Raw)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Raw - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, 
insertStatement, batchSize, dialect, JdbcWriteType.Raw)++    log.info(s"*** Finished writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")+  }++  /**+   * @param connection    database connection+   * @param url           used to determine db platform+   * @param tableName     name of the table without the schema prefix+   * @param jdbcWriteType indicates Raw or Merged data write type+   */+  private def createIndexes(connection: Connection, url: String, tableName: String, jdbcWriteType: JdbcWriteType.Value): Unit = {+    val stmt = connection.createStatement+    val tableNameNoSchema = tableName.substring(tableName.indexOf(".") + 1)+    if (url.toLowerCase.contains("sqlserver") || url.toLowerCase.contains("postgresql") || url.toLowerCase.contains("oracle")) {++      // Create primary key.+      var ddlPK = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_pk PRIMARY KEY "+      if (jdbcWriteType == JdbcWriteType.Merged) {+        ddlPK = ddlPK + "(\"id\")"+      }+      else {+        ddlPK = ddlPK + "(\"id\", \"gwcbi___seqval_hex\")"+      }+      log.info(s"$jdbcWriteType - $ddlPK")+      stmt.execute(ddlPK)++      // Create alternate keys for Merged data.  Raw data will not have any alternate keys since columns other than+      // the PK can be null (due to records for deletes).+      if (jdbcWriteType == JdbcWriteType.Merged) {+        var ddlAK1 = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_ak1 UNIQUE "+        if (tableNameNoSchema.startsWith("pctl_") || tableNameNoSchema.startsWith("cctl_") || tableNameNoSchema.startsWith("bctl_") || tableNameNoSchema.startsWith("abtl_")) {+          ddlAK1 = ddlAK1 + "(\"typecode\")"+        }+        else {+          ddlAK1 = ddlAK1 + "(\"publicid\")"+        }+        log.info(s"$jdbcWriteType - $ddlAK1")+        stmt.execute(ddlAK1)+      }++    } else {+      log.info(s"Unsupported database.  $url. Indexes were not created.")+      stmt.close()+      throw new SQLException(s"Unsupported database platform: $url")+    }++    stmt.close()+  }++  /** Merge the raw transactions into a JDBC target database applying the inserts/updates/deletes+   * according to transactions in the raw CDC data.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcMerged(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    log.info(s"+++ Merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")++    val tableName = clientConfig.jdbcConnectionMerged.jdbcSchema + "." 
+ tableDataFrameWrapperForMicroBatch.tableName+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName++    tableDataFrameWrapperForMicroBatch.dataFrame.cache()++    // Get list of CDA internal use columns to get rid of.+    val dropList = tableDataFrameWrapperForMicroBatch.dataFrame.columns.filter(colName => colName.toLowerCase.startsWith("gwcbi___"))++    // Log total rows to be merged for this fingerprint.+    val totCnt = tableDataFrameWrapperForMicroBatch.dataFrame.count()+    log.info(s"Merged - $tableName total cnt for all ins/upd/del: ${totCnt.toString}")++    // Filter for records to insert and drop unwanted columns.+    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(2, 0))+      .drop(dropList: _*)+      .cache()++    // Log total rows to be inserted for this fingerprint.+    val insCnt = InsertDF.count()+    log.info(s"Merged - $tableName insert cnt after filter: ${insCnt.toString}")++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionMerged.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName+    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Merged, dbProductName)+      log.info(s"Merged - $createTableDDL")+      // Execute the table create DDL+      val stmt = connection.createStatement+      stmt.execute(createTableDDL)+      stmt.close()++      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Merged)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Merged - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, insertStatement, batchSize, dialect, JdbcWriteType.Merged)++    InsertDF.unpersist()++    // Filter for records to update.+    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))+      .cache()++    // Log total rows marked as updates.+    val UpdCnt = UpdateDF.count()+    log.info(s"Merged - $tableName update cnt after filter: ${UpdCnt.toString}")++    // Generate and apply update statements based on the latest transaction for each id.+    if (UpdCnt > 0) {++      // Get the list of columns+      val colNamesArray = UpdateDF.columns.toBuffer+      
// Remove the id and sequence from the list of columns so we can handle them separately.+      // We will be grouping by the id and the sequence will be set as the first item in the list of columns.+      colNamesArray --= Array("id", "gwcbi___seqval_hex")+      val colNamesString = "gwcbi___seqval_hex, " + colNamesArray.mkString(",")++      val latestChangeForEachID = if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Find the latest change for each id based on the gwcbi___seqval_hex.+        // Note: For nested structs, max on struct is computed as+        // max on first struct field, if equal fall back to second fields, and so on.+        // In this case the first struct field is gwcbi___seqval_hex which will be always+        // be unique for each instance of an id in the group.+        UpdateDF+          .selectExpr(Seq("id", s"struct($colNamesString) as otherCols"): _*)+          .groupBy("id").agg(sqlfun.max("otherCols").as("latest"))+          .selectExpr("latest.*", "id")+          .drop(dropList: _*)+          .cache()+      } else {+        // Retain all updates.  Sort so they are applied in the correct order.+        val colVar = colNamesString + ",id"+        UpdateDF+          .selectExpr(colVar.split(","): _*)+          .sort(col("gwcbi___seqval_hex").asc)+          .drop(dropList: _*)+          .cache()+      }+      UpdateDF.unpersist()++      val latestUpdCnt = latestChangeForEachID.count()+      if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Log row count following the reduction to only last update for each id.+        log.info(s"Merged - $tableName update cnt after agg to get latest for each id: ${latestUpdCnt.toString}")+      } else {+        log.info(s"Merged - $tableName all updates will be applied in sequence.")+      }++      // Build the sql Update statement to be used as a prepared statement for the Updates.+      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")+      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")+      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"+      log.info(s"Merged - $updateStatement")++      // Get schema info required for updatePartition call.+      val updateSchema = latestChangeForEachID.schema++      // Prepare and execute one update statement per row in our update dataframe.+      updateDataframe(connection, tableName, latestChangeForEachID, updateSchema, updateStatement, batchSize, dialect, JdbcWriteType.Merged)++      latestChangeForEachID.unpersist()+    }++    // Filter for records to be deleted.+    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.+    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))+      .selectExpr("id")+      .cache()++    // Log number of records to be deleted.+    val delCnt = DeleteDF.count()+    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")++    // Generate and apply delete statements.+    if (delCnt > 0) {+      val deleteSchema = DeleteDF.schema+      // Build the sql Delete statement to be used as a prepared statement for the Updates.+      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"+      log.info(s"Merged - $deleteStatement")++      // Prepare and execute one delete statement per row in our delete dataframe.+      updateDataframe(connection, tableName, DeleteDF, 
deleteSchema, deleteStatement, batchSize, dialect, JdbcWriteType.Merged)++      tableDataFrameWrapperForMicroBatch.dataFrame.unpersist()+      DeleteDF.unpersist()+    }+    log.info(s"+++ Finished merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")+  }++  /** Build and return a table create DDL statement based on the given schema definition.+   *+   * @param dialect+   * @param schema+   * @param tableName+   * @param jdbcWriteType+   * @param dbProductName+   * @return Table create DDL statement for the given table.+   */+  def getTableCreateDDL(dialect: JdbcDialect, schema: StructType, tableName: String, jdbcWriteType: JdbcWriteType.Value, dbProductName: String): String = {+    val allTableColumnsDefinitions = new StringBuilder()++    //TODO Should this be set up as defined list(s)?+    // Define specific columns we want to set as NOT NULL. Everything coming out of CDA parquet files is defined as nullable so+    // we do this to ensure there are columns available to set as PKs and/or AKs.+    var notNullCols = List("id", "gwcbi___operation", "gwcbi___seqval_hex")+    if (jdbcWriteType == JdbcWriteType.Merged) {+      // For merged data, include publicid, retired, and typecode in list of not null columns+      // so they can be included in unique index definitions.+      notNullCols = notNullCols ++ List("publicid", "retired", "typecode")+    }+    // Build the list of columns in alphabetic order.+    schema.fields.sortBy(f => f.name).foreach { field =>+      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true+      val name = dialect.quoteIdentifier(field.name)+      val columnDefinition = buildDDLColumnDefinition(dialect, dbProductName, tableName, name, field.dataType, nullable)+      allTableColumnsDefinitions.append(s"$columnDefinition, ")+    }+    // Remove the trailing comma.+    val colsForCreateDDL = allTableColumnsDefinitions.stripSuffix(", ")+    // Build and return the final create table statement.+    s"CREATE TABLE $tableName ($colsForCreateDDL)"+  }++  /** Build and return a column definition to be used in CREATE and ALTER DDL statements.+   *+   * @param dialect+   * @param dbProductName+   * @param tableName+   * @param fieldName+   * @param fieldDataType+   * @param fieldNullable+   * @return Column definition - COLUMN_NAME TYPE_DECLARATION NULLABLE (i.e. '"ColumnName" VARCHAR(1333) NOT NULL').+   */+  def buildDDLColumnDefinition(dialect: JdbcDialect, dbProductName: String, tableName: String, fieldName: String, fieldDataType: DataType, fieldNullable: Boolean): String = {+    val columnDefinition = new StringBuilder()++    // TODO Consider making gwcbi___seqval_hex a smaller varchar than (1333) since it is part of a clustered index+    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.+    // nvarchar(max) columns can't be indexed.  
Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.+    val stringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(1333)"+      case "PostgreSQL"           => "VARCHAR(1333)"+      case "Oracle"               => "VARCHAR2(1333)"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for string data we need to handle very large text columns that we know of to avoid truncation sql exceptions.+    val largeStringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(max)"+      case "PostgreSQL"           => "VARCHAR"+      case "Oracle"               => "VARCHAR2(32767)" // requires MAX_STRING_SIZE Oracle parameter to be set to EXTENDED.+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for BLOB data we need to handle differently for different platforms.+    val blobDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARBINARY(max)"+      case "PostgreSQL"           => "bytea"+      case "Oracle"               => "BLOB"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val fieldDataTypeDefinition = if (fieldDataType == StringType)+    // TODO Consider making the determination for the need for very large text columns configurable.+    // These are the OOTB columns we have found so far.+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))+        || (tableName.equals("cc_note") && fieldName.equals("body"))+      ) largeStringDataType+      else stringDataType+    else if (fieldDataType == BinaryType) blobDataType+    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition+    val nullable = if (!fieldNullable) "NOT NULL" else ""+    columnDefinition.append(s"$fieldName $fieldDataTypeDefinition $nullable")+    columnDefinition.toString()+  }++  private def updateDataframe(conn: Connection,+                              table: String,+                              df: DataFrame,+                              rddSchema: StructType,+                              updateStmt: String,+                              batchSize: Int,+                              dialect: JdbcDialect,+                              jdbcWriteType: JdbcWriteType.Value+                             ): Unit = {+    var completed = false+    var totalRowCount = 0L+    val dbProductName = conn.getMetaData.getDatabaseProductName+    try {+      val stmt = conn.prepareStatement(updateStmt)+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))+      //For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues+      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)+      val numFields = rddSchema.fields.length++      try {+        var rowCount = 0++        df.collect().foreach { row =>+          var i = 0+          while (i < numFields) {+            if (row.isNullAt(i)) {+              stmt.setNull(i + 1, nullTypes(i))+            } else {+              
setters(i).apply(stmt, row, i)+            }+            i = i + 1+          }+          stmt.addBatch()+          rowCount += 1+          totalRowCount += 1+          if (rowCount % batchSize == 0) {+            stmt.executeBatch()+            log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+            rowCount = 0+          }+        }++        if (rowCount > 0) {+          stmt.executeBatch()+          log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+        }+      } finally {+        stmt.close()+      }+      completed = true+    } catch {+      case e: SQLException =>+        //log.info(s"Catch exception for $table - $updateStmt")+        val cause = e.getCause+        val nextcause = e.getNextException+        if (nextcause != null && cause != nextcause) {+          // If there is no cause already, set 'next exception' as cause. If cause is null,+          // it *may* be because no cause was set yet+          if (cause == null) {+            try {+              e.initCause(nextcause)+            } catch {+              // Or it may be null because the cause *was* explicitly initialized, to *null*,+              // in which case this fails. There is no other way to detect it.+              // addSuppressed in this case as well.+              case _: IllegalStateException => e.addSuppressed(nextcause)+            }+          } else {+            e.addSuppressed(nextcause)+          }+        }+        throw e+    } finally {+      if (!completed) {+        // The stage must fail.  We got here through an exception path, so+        // let the exception through and tell the user about another problem.+        log.info(s"$jdbcWriteType - Update failed for $table - $updateStmt")+      } else {+        log.info(s"$jdbcWriteType - Total rows updated for $table: $totalRowCount rows - $updateStmt")+      }+    }+  }++  private def getJdbcType(dt: DataType, dialect: JdbcDialect): JdbcType = {+    dialect.getJDBCType(dt).orElse(getCommonJDBCType(dt)).getOrElse(+      throw new IllegalArgumentException(s"Can't get JDBC type for $dt.catalogString"))+  }++  /**+   * Retrieve standard jdbc types.+   *+   * @param dt The datatype (e.g. 
[[org.apache.spark.sql.types.StringType]])+   * @return The default JdbcType for this DataType+   */+  private def getCommonJDBCType(dt: DataType): Option[JdbcType] = {++    dt match {+      case IntegerType    => Option(JdbcType("INTEGER", java.sql.Types.INTEGER))+      case LongType       => Option(JdbcType("BIGINT", java.sql.Types.BIGINT))+      case DoubleType     => Option(JdbcType("DOUBLE PRECISION", java.sql.Types.DOUBLE))+      case FloatType      => Option(JdbcType("REAL", java.sql.Types.FLOAT))+      case ShortType      => Option(JdbcType("INTEGER", java.sql.Types.SMALLINT))+      case ByteType       => Option(JdbcType("BYTE", java.sql.Types.TINYINT))+      case BooleanType    => Option(JdbcType("BIT(1)", java.sql.Types.BIT))+      case StringType     => Option(JdbcType("TEXT", java.sql.Types.CLOB))+      case BinaryType     => Option(JdbcType("BLOB", java.sql.Types.BLOB))+      case TimestampType  => Option(JdbcType("TIMESTAMP", java.sql.Types.TIMESTAMP))+      case DateType       => Option(JdbcType("DATE", java.sql.Types.DATE))+      case t: DecimalType => Option(+        JdbcType(s"DECIMAL(${t.precision},${t.scale})", java.sql.Types.DECIMAL))+      case _              => None+    }+  }++  /**+   * A `JDBCValueSetter` is responsible for setting a value from `Row` into a field for+   * `PreparedStatement`. The last argument `Int` means the index for the value to be set+   * in the SQL statement and also used for the value in `Row`.+   * private type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit+   */+  private type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit++  private def makeSetter(+                          conn: Connection,+                          dialect: JdbcDialect,+                          dataType: DataType): JDBCValueSetter = dataType match {+    case IntegerType      =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getInt(pos))+    case LongType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setLong(pos + 1, row.getLong(pos))+    case DoubleType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setDouble(pos + 1, row.getDouble(pos))+    case FloatType        =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setFloat(pos + 1, row.getFloat(pos))+    case ShortType        =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getShort(pos))+    case ByteType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setInt(pos + 1, row.getByte(pos))+    case BooleanType      =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBoolean(pos + 1, row.getBoolean(pos))+    case StringType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setString(pos + 1, row.getString(pos))+    case BinaryType       =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBytes(pos + 1, row.getAs[Array[Byte]](pos))+    case TimestampType    =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setTimestamp(pos + 1, row.getAs[java.sql.Timestamp](pos))+    case DateType         =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setDate(pos + 1, row.getAs[java.sql.Date](pos))+    case t: DecimalType   =>+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        stmt.setBigDecimal(pos + 1, row.getDecimal(pos))+    case ArrayType(et, _) =>+      // remove type length parameters from end of 
type name+      val typeName = getJdbcType(et, dialect).databaseTypeDefinition+        .toLowerCase(Locale.ROOT).split("\\(")(0)+      (stmt: PreparedStatement, row: Row, pos: Int) =>+        val array = conn.createArrayOf(+          typeName,+          row.getSeq[AnyRef](pos).toArray)+        stmt.setArray(pos + 1, array)+    case _                =>+      (_: PreparedStatement, _: Row, pos: Int) =>+        throw new IllegalArgumentException(+          s"Can't translate non-null value for field $pos")+  }++  /** Determine if table schema definition is the same as the parquet file schema definition.+   * If differences in columns, ADD or DROP necessary columns from database table to align definitions, and re-check+   * for a match between database and file schema definitions.+   *+   * @param fileDataFrame  based on the parquet file format+   * @param jdbcSchemaName database schema name+   * @param tableName      is the name of the database table we being compared to+   * @param url            database url+   * @param user           database user name+   * @param pswd           database password+   * @parm spark is the spark session.+   * @param jdbcWriteType Merge vs Raw to determine exclusion of internal 'gwcbi__' columns+   *                      when comparing schemas.  When merging data we remove those columns+   *                      from the data set before saving the data so we don't want to check+   *                      for them when comparing to the schema definition in the database.+   * @return Boolean indicating if the table schema definition is the same as the parquet file schema definition+   */+  //TODO Consider renaming this function (schemasAreConsistent) to indicate we're doing more than just checking for consistency+  def schemasAreConsistent(fileDataFrame: DataFrame, jdbcSchemaName: String, tableName: String, schemaFingerprint: String, url: String,+                           user: String, pswd: String, spark: SparkSession, jdbcWriteType: JdbcWriteType.Value): Boolean = {++    if (tableExists(tableName, url, user, pswd)) {+      // build a query that returns no data from the table.  This will still get us the schema definition which is all we need.+      val sql = "(select * from " + jdbcSchemaName + "." + tableName + " where 1=2) as " + tableName
      val sql = s"(select * from $jdbcSchemaName.$tableName where 1=2) as $tableName"
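For context on the `where 1=2` predicate in the suggestion above: handing that subquery to Spark's JDBC reader as `dbtable` returns zero rows but still yields the table's column names and types, which is all the schema comparison needs. A minimal sketch, assuming a local SparkSession and placeholder connection details (URL, schema, and credentials are not from the PR):

```scala
import org.apache.spark.sql.SparkSession

object SchemaProbeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("schema-probe-sketch").master("local[*]").getOrCreate()

    // Placeholder schema/table/credentials; the shape of the query is the point.
    val jdbcSchemaName = "public"
    val tableName = "cc_note"
    val sql = s"(select * from $jdbcSchemaName.$tableName where 1=2) as $tableName"

    val emptyDF = spark.read
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/cda") // placeholder URL
      .option("dbtable", sql)
      .option("user", "cda_user")
      .option("password", "cda_password")
      .load()

    // No rows come back, but the DataFrame still carries the table's schema,
    // which can then be compared against the parquet file's schema.
    println(emptyDF.schema.treeString)
    spark.stop()
  }
}
```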
cwilliams-gw

comment created time in a month
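The writeJdbcMerged hunk in this comment reduces a batch of CDC updates to the latest change per id by packing the sequence value and the payload columns into a struct (sequence first) and taking max of that struct per id, since struct comparison proceeds field by field. A minimal, self-contained sketch of that trick using made-up columns:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max

object LatestChangePerIdSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("latest-change-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // Hypothetical CDC update rows: (id, gwcbi___seqval_hex, status).
    val updates = Seq(
      (1L, "000a", "open"),
      (1L, "000c", "closed"), // higher sequence value, should win for id 1
      (2L, "0001", "draft")
    ).toDF("id", "gwcbi___seqval_hex", "status")

    // Struct comparison starts with the first field, so putting the sequence
    // value first makes max() pick the most recent row for each id.
    val latestChangeForEachId = updates
      .selectExpr("id", "struct(gwcbi___seqval_hex, status) as otherCols")
      .groupBy("id")
      .agg(max("otherCols").as("latest"))
      .selectExpr("id", "latest.*")

    latestChangeForEachId.show()
    spark.stop()
  }
}
```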

Pull request review commentGuidewire/cda-client

Ps dm branch 1

[Diff context omitted — duplicate of the OutputWriter diff hunk quoted in full in an earlier review comment above. The commented lines, in writeJdbcMerged:]

    // Filter for records to be deleted.
    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.
    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))
      .selectExpr("id")
      .cache()

    // Log number of records to be deleted.
    val delCnt = DeleteDF.count()

camel case it

cwilliams-gw

comment created time in a month

Pull request review commentGuidewire/cda-client

Ps dm branch 1

[Diff context omitted — duplicate of the OutputWriter diff hunk quoted in full in an earlier review comment above. The entry is cut off at the commented lines in updateDataframe:]

      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))
      // For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues
      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)

Expand this out to multiple lines.
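
One way the flagged line could be expanded (a sketch only; the names, types, and the Oracle workaround are taken from the surrounding diff):

      // For Oracle only - map Boolean to TINYINT to work around Oracle JDBC driver issues.
      val nullTypes = rddSchema.fields.map { f =>
        if (dbProductName == "Oracle" && f.dataType == BooleanType) {
          JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType
        } else {
          getJdbcType(f.dataType, dialect).jdbcNullType
        }
      }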

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter (excerpt from createIndexes):

      // Create alternate keys for Merged data.  Raw data will not have any alternate keys since columns other than
      // the PK can be null (due to records for deletes).
      if (jdbcWriteType == JdbcWriteType.Merged) {
        var ddlAK1 = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_ak1 UNIQUE "
        var ddlAK1 = s"ALTER TABLE $tableName ADD CONSTRAINT $tableNameNoSchema _ak1 UNIQUE "
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter (excerpt from getTableCreateDDL):

    // Build the list of columns in alphabetic order.
    schema.fields.sortBy(f => f.name).foreach { field =>
      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true
      val nullable = !notNullCols.contains(field.name) && field.nullable
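
The simplification is behavior-preserving: by De Morgan, !(A || !B) is equivalent to !A && B. A quick illustrative check (not part of the PR):

    for {
      inList   <- Seq(true, false)
      nullable <- Seq(true, false)
    } assert((if (inList || !nullable) false else true) == (!inList && nullable))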
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter (excerpt from buildDDLColumnDefinition):

    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.
    // nvarchar(max) columns can't be indexed.  Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.
    val stringDataType = dbProductName match {
      case "Microsoft SQL Server" => "VARCHAR(1333)"
      case "PostgreSQL"           => "VARCHAR(1333)"
      case "Microsoft SQL Server" | "PostgreSQL" => "VARCHAR(1333)"
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter (excerpt from buildDDLColumnDefinition):

    val fieldDataTypeDefinition = if (fieldDataType == StringType)
    // TODO Consider making the determination for the need for very large text columns configurable.
    // These are the OOTB columns we have found so far.
      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))
        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))
        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))
        || (tableName.equals("cc_note") && fieldName.equals("body"))
      ) largeStringDataType
      else stringDataType
    else if (fieldDataType == BinaryType) blobDataType
    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition
    val nullable = if (!fieldNullable) "NOT NULL" else ""
    val nullableQualifier = if (!fieldNullable) "NOT NULL" else ""

In other parts of the code, "nullable" is a boolean. I think this name expresses the intent more clearly.
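For illustration only, a tiny runnable sketch of the distinction the rename is meant to capture; the field values below are hypothetical stand-ins, not taken from the diff:

object NullableQualifierSketch extends App {
  // Hypothetical schema-field inputs.
  val fieldName = "\"publicid\""
  val fieldDataTypeDefinition = "VARCHAR(1333)"
  val fieldNullable = false            // elsewhere "nullable" names a Boolean flag like this one
  // The renamed value is the SQL qualifier string appended to the column DDL, not a flag.
  val nullableQualifier = if (!fieldNullable) "NOT NULL" else ""
  println(s"$fieldName $fieldDataTypeDefinition $nullableQualifier".trim)
  // prints: "publicid" VARCHAR(1333) NOT NULL
}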

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {     }   } +  /** Write a table to a Parquet.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  def writeParquet(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {+    val pathToFolderWithParquet = this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)+    if (saveAsSingleFile) {+      tableDataFrameWrapperForMicroBatch.dataFrame+        .coalesce(1)+        .write.mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    } else {+      tableDataFrameWrapperForMicroBatch.dataFrame.write+        .mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    }+  }++  /** Write RAW data to a JDBC target database.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcRaw(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    val tableName = clientConfig.jdbcConnectionRaw.jdbcSchema + "." + tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp++    log.info(s"*** Writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")++    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame+    InsertDF.cache()++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionRaw.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName++    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next()++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Raw, dbProductName)+      // Execute the table create DDL+      val stmt = connection.createStatement+      log.info(s"Raw - $createTableDDL")+      stmt.execute(createTableDDL)+      stmt.close()+      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Raw)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Raw - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, 
insertStatement, batchSize, dialect, JdbcWriteType.Raw)++    log.info(s"*** Finished writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")+  }++  /**+   * @param connection    database connection+   * @param url           used to determine db platform+   * @param tableName     name of the table without the schema prefix+   * @param jdbcWriteType indicates Raw or Merged data write type+   */+  private def createIndexes(connection: Connection, url: String, tableName: String, jdbcWriteType: JdbcWriteType.Value): Unit = {+    val stmt = connection.createStatement+    val tableNameNoSchema = tableName.substring(tableName.indexOf(".") + 1)+    if (url.toLowerCase.contains("sqlserver") || url.toLowerCase.contains("postgresql") || url.toLowerCase.contains("oracle")) {++      // Create primary key.+      var ddlPK = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_pk PRIMARY KEY "+      if (jdbcWriteType == JdbcWriteType.Merged) {+        ddlPK = ddlPK + "(\"id\")"+      }+      else {+        ddlPK = ddlPK + "(\"id\", \"gwcbi___seqval_hex\")"+      }+      log.info(s"$jdbcWriteType - $ddlPK")+      stmt.execute(ddlPK)++      // Create alternate keys for Merged data.  Raw data will not have any alternate keys since columns other than+      // the PK can be null (due to records for deletes).+      if (jdbcWriteType == JdbcWriteType.Merged) {+        var ddlAK1 = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_ak1 UNIQUE "+        if (tableNameNoSchema.startsWith("pctl_") || tableNameNoSchema.startsWith("cctl_") || tableNameNoSchema.startsWith("bctl_") || tableNameNoSchema.startsWith("abtl_")) {+          ddlAK1 = ddlAK1 + "(\"typecode\")"+        }+        else {+          ddlAK1 = ddlAK1 + "(\"publicid\")"+        }+        log.info(s"$jdbcWriteType - $ddlAK1")+        stmt.execute(ddlAK1)+      }++    } else {+      log.info(s"Unsupported database.  $url. Indexes were not created.")+      stmt.close()+      throw new SQLException(s"Unsupported database platform: $url")+    }++    stmt.close()+  }++  /** Merge the raw transactions into a JDBC target database applying the inserts/updates/deletes+   * according to transactions in the raw CDC data.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcMerged(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    log.info(s"+++ Merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")++    val tableName = clientConfig.jdbcConnectionMerged.jdbcSchema + "." 
+ tableDataFrameWrapperForMicroBatch.tableName+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName++    tableDataFrameWrapperForMicroBatch.dataFrame.cache()++    // Get list of CDA internal use columns to get rid of.+    val dropList = tableDataFrameWrapperForMicroBatch.dataFrame.columns.filter(colName => colName.toLowerCase.startsWith("gwcbi___"))++    // Log total rows to be merged for this fingerprint.+    val totCnt = tableDataFrameWrapperForMicroBatch.dataFrame.count()+    log.info(s"Merged - $tableName total cnt for all ins/upd/del: ${totCnt.toString}")++    // Filter for records to insert and drop unwanted columns.+    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(2, 0))+      .drop(dropList: _*)+      .cache()++    // Log total rows to be inserted for this fingerprint.+    val insCnt = InsertDF.count()+    log.info(s"Merged - $tableName insert cnt after filter: ${insCnt.toString}")++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionMerged.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName+    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Merged, dbProductName)+      log.info(s"Merged - $createTableDDL")+      // Execute the table create DDL+      val stmt = connection.createStatement+      stmt.execute(createTableDDL)+      stmt.close()++      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Merged)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Merged - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, insertStatement, batchSize, dialect, JdbcWriteType.Merged)++    InsertDF.unpersist()++    // Filter for records to update.+    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))+      .cache()++    // Log total rows marked as updates.+    val UpdCnt = UpdateDF.count()+    log.info(s"Merged - $tableName update cnt after filter: ${UpdCnt.toString}")++    // Generate and apply update statements based on the latest transaction for each id.+    if (UpdCnt > 0) {++      // Get the list of columns+      val colNamesArray = UpdateDF.columns.toBuffer+      
// Remove the id and sequence from the list of columns so we can handle them separately.+      // We will be grouping by the id and the sequence will be set as the first item in the list of columns.+      colNamesArray --= Array("id", "gwcbi___seqval_hex")+      val colNamesString = "gwcbi___seqval_hex, " + colNamesArray.mkString(",")++      val latestChangeForEachID = if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Find the latest change for each id based on the gwcbi___seqval_hex.+        // Note: For nested structs, max on struct is computed as+        // max on first struct field, if equal fall back to second fields, and so on.+        // In this case the first struct field is gwcbi___seqval_hex which will be always+        // be unique for each instance of an id in the group.+        UpdateDF+          .selectExpr(Seq("id", s"struct($colNamesString) as otherCols"): _*)+          .groupBy("id").agg(sqlfun.max("otherCols").as("latest"))+          .selectExpr("latest.*", "id")+          .drop(dropList: _*)+          .cache()+      } else {+        // Retain all updates.  Sort so they are applied in the correct order.+        val colVar = colNamesString + ",id"+        UpdateDF+          .selectExpr(colVar.split(","): _*)+          .sort(col("gwcbi___seqval_hex").asc)+          .drop(dropList: _*)+          .cache()+      }+      UpdateDF.unpersist()++      val latestUpdCnt = latestChangeForEachID.count()+      if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Log row count following the reduction to only last update for each id.+        log.info(s"Merged - $tableName update cnt after agg to get latest for each id: ${latestUpdCnt.toString}")+      } else {+        log.info(s"Merged - $tableName all updates will be applied in sequence.")+      }++      // Build the sql Update statement to be used as a prepared statement for the Updates.+      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")+      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")+      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"+      log.info(s"Merged - $updateStatement")++      // Get schema info required for updatePartition call.+      val updateSchema = latestChangeForEachID.schema++      // Prepare and execute one update statement per row in our update dataframe.+      updateDataframe(connection, tableName, latestChangeForEachID, updateSchema, updateStatement, batchSize, dialect, JdbcWriteType.Merged)++      latestChangeForEachID.unpersist()+    }++    // Filter for records to be deleted.+    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.+    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))+      .selectExpr("id")+      .cache()++    // Log number of records to be deleted.+    val delCnt = DeleteDF.count()+    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")++    // Generate and apply delete statements.+    if (delCnt > 0) {+      val deleteSchema = DeleteDF.schema+      // Build the sql Delete statement to be used as a prepared statement for the Updates.+      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"+      log.info(s"Merged - $deleteStatement")++      // Prepare and execute one delete statement per row in our delete dataframe.+      updateDataframe(connection, tableName, DeleteDF, 
deleteSchema, deleteStatement, batchSize, dialect, JdbcWriteType.Merged)++      tableDataFrameWrapperForMicroBatch.dataFrame.unpersist()+      DeleteDF.unpersist()+    }+    log.info(s"+++ Finished merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")+  }++  /** Build and return a table create DDL statement based on the given schema definition.+   *+   * @param dialect+   * @param schema+   * @param tableName+   * @param jdbcWriteType+   * @param dbProductName+   * @return Table create DDL statement for the given table.+   */+  def getTableCreateDDL(dialect: JdbcDialect, schema: StructType, tableName: String, jdbcWriteType: JdbcWriteType.Value, dbProductName: String): String = {+    val allTableColumnsDefinitions = new StringBuilder()++    //TODO Should this be set up as defined list(s)?+    // Define specific columns we want to set as NOT NULL. Everything coming out of CDA parquet files is defined as nullable so+    // we do this to ensure there are columns available to set as PKs and/or AKs.+    var notNullCols = List("id", "gwcbi___operation", "gwcbi___seqval_hex")+    if (jdbcWriteType == JdbcWriteType.Merged) {+      // For merged data, include publicid, retired, and typecode in list of not null columns+      // so they can be included in unique index definitions.+      notNullCols = notNullCols ++ List("publicid", "retired", "typecode")+    }+    // Build the list of columns in alphabetic order.+    schema.fields.sortBy(f => f.name).foreach { field =>+      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true+      val name = dialect.quoteIdentifier(field.name)+      val columnDefinition = buildDDLColumnDefinition(dialect, dbProductName, tableName, name, field.dataType, nullable)+      allTableColumnsDefinitions.append(s"$columnDefinition, ")+    }+    // Remove the trailing comma.+    val colsForCreateDDL = allTableColumnsDefinitions.stripSuffix(", ")+    // Build and return the final create table statement.+    s"CREATE TABLE $tableName ($colsForCreateDDL)"+  }++  /** Build and return a column definition to be used in CREATE and ALTER DDL statements.+   *+   * @param dialect+   * @param dbProductName+   * @param tableName+   * @param fieldName+   * @param fieldDataType+   * @param fieldNullable+   * @return Column definition - COLUMN_NAME TYPE_DECLARATION NULLABLE (i.e. '"ColumnName" VARCHAR(1333) NOT NULL').+   */+  def buildDDLColumnDefinition(dialect: JdbcDialect, dbProductName: String, tableName: String, fieldName: String, fieldDataType: DataType, fieldNullable: Boolean): String = {+    val columnDefinition = new StringBuilder()++    // TODO Consider making gwcbi___seqval_hex a smaller varchar than (1333) since it is part of a clustered index+    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.+    // nvarchar(max) columns can't be indexed.  
Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.+    val stringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(1333)"+      case "PostgreSQL"           => "VARCHAR(1333)"+      case "Oracle"               => "VARCHAR2(1333)"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for string data we need to handle very large text columns that we know of to avoid truncation sql exceptions.+    val largeStringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(max)"+      case "PostgreSQL"           => "VARCHAR"+      case "Oracle"               => "VARCHAR2(32767)" // requires MAX_STRING_SIZE Oracle parameter to be set to EXTENDED.+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for BLOB data we need to handle differently for different platforms.+    val blobDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARBINARY(max)"+      case "PostgreSQL"           => "bytea"+      case "Oracle"               => "BLOB"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val fieldDataTypeDefinition = if (fieldDataType == StringType)+    // TODO Consider making the determination for the need for very large text columns configurable.+    // These are the OOTB columns we have found so far.+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))+        || (tableName.equals("cc_note") && fieldName.equals("body"))+      ) largeStringDataType+      else stringDataType+    else if (fieldDataType == BinaryType) blobDataType+    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition+    val nullable = if (!fieldNullable) "NOT NULL" else ""+    columnDefinition.append(s"$fieldName $fieldDataTypeDefinition $nullable")+    columnDefinition.toString()+  }++  private def updateDataframe(conn: Connection,+                              table: String,+                              df: DataFrame,+                              rddSchema: StructType,+                              updateStmt: String,+                              batchSize: Int,+                              dialect: JdbcDialect,+                              jdbcWriteType: JdbcWriteType.Value+                             ): Unit = {+    var completed = false+    var totalRowCount = 0L+    val dbProductName = conn.getMetaData.getDatabaseProductName+    try {+      val stmt = conn.prepareStatement(updateStmt)+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))+      //For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues+      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)+      val numFields = rddSchema.fields.length++      try {+        var rowCount = 0++        df.collect().foreach { row =>

If I understand correctly, you're trying to do some batching logic. There's this handy iterator method called grouped that will do that for you.

df.collect().grouped(batchSize).foreach { batch =>
  batch.foreach { row =>
    ...
    stmt.addBatch()
    totalRowCount += 1
  }
  stmt.executeBatch()
  log.info(...)
}
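For reference, a self-contained sketch of this grouped-based batching pattern; the Row case class and FakeStatement below are hypothetical stand-ins for the Spark Row and JDBC PreparedStatement used in updateDataframe, not this project's actual types:

object GroupedBatchingSketch extends App {
  final case class Row(id: Int)

  // Stand-in for a PreparedStatement that just counts batched work.
  final class FakeStatement {
    private var pending = 0
    def addBatch(): Unit = pending += 1
    def executeBatch(): Int = { val n = pending; pending = 0; n }
  }

  val rows = (1 to 12).map(i => Row(i))
  val batchSize = 5
  val stmt = new FakeStatement
  var totalRowCount = 0L

  // grouped splits the rows into chunks of at most batchSize, so the
  // per-batch executeBatch/log step falls naturally out of the loop structure.
  rows.grouped(batchSize).foreach { batch =>
    batch.foreach { row =>
      // ... bind the row's values to the statement here ...
      stmt.addBatch()
      totalRowCount += 1
    }
    val applied = stmt.executeBatch()
    println(s"executed batch of $applied rows, $totalRowCount total so far")
  }
}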
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {   //Instance vars that are passed in during constructor of the concrete classes   val outputPath: String   val includeColumnNames: Boolean-  val saveAsSingleFileCSV: Boolean+  val saveAsSingleFile: Boolean   val saveIntoTimestampDirectory: Boolean+  val clientConfig: ClientConfig++  object JdbcWriteType extends Enumeration {+    type JdbcWriteType = Value++    val Raw = Value("Raw")+    val Merged = Value("Merged")+  }    /** Validate the outputPath, making sure that it exists/is a valid directory.-    * If there is a problem, throw an exception.-    *-    * In the case of local output, makes sure that the outputPath is a directory-    * that exists and is not a file.-    *-    * In the case of S3 output, makes sure that the outputPath is in an existing-    * S3 bucket and is also not an existing key to a S3 object.-    *-    * @throws java.io.IOException If the validation was not successful-    *-    */+   * If there is a problem, throw an exception.+   *+   * In the case of local output, makes sure that the outputPath is a directory+   * that exists and is not a file.+   *+   * In the case of S3 output, makes sure that the outputPath is in an existing+   * S3 bucket and is also not an existing key to a S3 object.+   *+   * @throws java.io.IOException If the validation was not successful+   *+   */   def validate(): Unit -  /** Write a table and its schema to either local filesystem or to S3.-    *-    * @param tableDataFrameWrapperForMicroBatch has the data to be written-    */+  /** Write a table and its schema to either local filesystem or to S3+   * and also to JDBC as indicated by clientConfig settings.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */   def write(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {     val tableName = tableDataFrameWrapperForMicroBatch.tableName-    log.info(s"Writing '$tableName' DataFrame as CSV to ${this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)}")-    this.writeCSV(tableDataFrameWrapperForMicroBatch)-    this.writeSchema(tableDataFrameWrapperForMicroBatch)-    log.info(s"Wrote '$tableName' DataFrame as CSV complete, with columns ${tableDataFrameWrapperForMicroBatch.dataFrame.columns.toList}")++    // Process file write.+    if (clientConfig.outputSettings.saveIntoFile) {+      if (clientConfig.outputSettings.fileFormat.toLowerCase == "csv") {+        log.info(s"Writing '$tableName' DataFrame as CSV to ${this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)}")+        this.writeCSV(tableDataFrameWrapperForMicroBatch)+        this.writeSchema(tableDataFrameWrapperForMicroBatch)+        log.info(s"Wrote '$tableName' DataFrame as CSV complete, with columns ${tableDataFrameWrapperForMicroBatch.dataFrame.columns.toList}")+      } else { //parquet+        log.info(s"Writing '$tableName' DataFrame as PARQUET to ${this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)}")+        this.writeParquet(tableDataFrameWrapperForMicroBatch)+        log.info(s"Wrote '$tableName' DataFrame as PARQUET complete")+      }+    }++    if (clientConfig.outputSettings.saveIntoJdbcRaw && clientConfig.outputSettings.saveIntoJdbcMerged) {

Probably not the best way to go about this, but this block contains a large amount of duplication.

val maybeRawConn = if (clientConfig.outputSettings.saveIntoJdbcRaw) {
  Some(DriverManager.getConnection(...))
} else {
  None
}
val maybeMergedConn = if (clientConfig.outputSettings.saveIntoJdbcMerged) ...
maybeRawConn.foreach(_.setAutoCommit(false))
maybeMergedConn.foreach(_.setAutoCommit(false))
maybeRawConn.foreach(rawConn => {
  try {
    this.writeJdbcRaw(...)
  } catch {
    ...
  }
})
...
maybeRawConn.foreach(_.commit())
...
maybeRawConn.foreach(_.close())

You get the idea. Basically, move the branches into the type. An Option is just a special kind of iterator with 0 or 1 items, so foreach will only execute if it is Some.
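As a concrete illustration, a minimal runnable sketch of that Option-based pattern; the Conn class and its write/commit/close methods are hypothetical stand-ins for java.sql.Connection and the writer methods in this trait:

object OptionConnectionSketch extends App {
  final class Conn(name: String) {
    def setAutoCommit(flag: Boolean): Unit = println(s"$name autoCommit=$flag")
    def write(): Unit = println(s"$name write")
    def commit(): Unit = println(s"$name commit")
    def close(): Unit = println(s"$name close")
  }

  val saveIntoJdbcRaw = true
  val saveIntoJdbcMerged = false

  // Each flag is checked once to build an Option, instead of branching at every step.
  val maybeRawConn    = if (saveIntoJdbcRaw) Some(new Conn("raw")) else None
  val maybeMergedConn = if (saveIntoJdbcMerged) Some(new Conn("merged")) else None

  // foreach runs only when the Option is Some, so a disabled target silently
  // skips every one of its steps without another if/else.
  maybeRawConn.foreach(_.setAutoCommit(false))
  maybeMergedConn.foreach(_.setAutoCommit(false))
  maybeRawConn.foreach(_.write())
  maybeMergedConn.foreach(_.write())
  maybeRawConn.foreach(_.commit())
  maybeMergedConn.foreach(_.commit())
  maybeRawConn.foreach(_.close())
  maybeMergedConn.foreach(_.close())
}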

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {     }   } +  /** Write a table to a Parquet.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  def writeParquet(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {+    val pathToFolderWithParquet = this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)+    if (saveAsSingleFile) {+      tableDataFrameWrapperForMicroBatch.dataFrame+        .coalesce(1)+        .write.mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    } else {+      tableDataFrameWrapperForMicroBatch.dataFrame.write+        .mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    }+  }++  /** Write RAW data to a JDBC target database.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcRaw(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    val tableName = clientConfig.jdbcConnectionRaw.jdbcSchema + "." + tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp++    log.info(s"*** Writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")++    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame+    InsertDF.cache()++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionRaw.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName++    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next()++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Raw, dbProductName)+      // Execute the table create DDL+      val stmt = connection.createStatement+      log.info(s"Raw - $createTableDDL")+      stmt.execute(createTableDDL)+      stmt.close()+      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Raw)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Raw - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, 
insertStatement, batchSize, dialect, JdbcWriteType.Raw)++    log.info(s"*** Finished writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")+  }++  /**+   * @param connection    database connection+   * @param url           used to determine db platform+   * @param tableName     name of the table without the schema prefix+   * @param jdbcWriteType indicates Raw or Merged data write type+   */+  private def createIndexes(connection: Connection, url: String, tableName: String, jdbcWriteType: JdbcWriteType.Value): Unit = {+    val stmt = connection.createStatement+    val tableNameNoSchema = tableName.substring(tableName.indexOf(".") + 1)+    if (url.toLowerCase.contains("sqlserver") || url.toLowerCase.contains("postgresql") || url.toLowerCase.contains("oracle")) {++      // Create primary key.+      var ddlPK = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_pk PRIMARY KEY "+      if (jdbcWriteType == JdbcWriteType.Merged) {+        ddlPK = ddlPK + "(\"id\")"+      }+      else {+        ddlPK = ddlPK + "(\"id\", \"gwcbi___seqval_hex\")"+      }+      log.info(s"$jdbcWriteType - $ddlPK")+      stmt.execute(ddlPK)++      // Create alternate keys for Merged data.  Raw data will not have any alternate keys since columns other than+      // the PK can be null (due to records for deletes).+      if (jdbcWriteType == JdbcWriteType.Merged) {+        var ddlAK1 = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_ak1 UNIQUE "+        if (tableNameNoSchema.startsWith("pctl_") || tableNameNoSchema.startsWith("cctl_") || tableNameNoSchema.startsWith("bctl_") || tableNameNoSchema.startsWith("abtl_")) {+          ddlAK1 = ddlAK1 + "(\"typecode\")"+        }+        else {+          ddlAK1 = ddlAK1 + "(\"publicid\")"+        }+        log.info(s"$jdbcWriteType - $ddlAK1")+        stmt.execute(ddlAK1)+      }++    } else {+      log.info(s"Unsupported database.  $url. Indexes were not created.")+      stmt.close()+      throw new SQLException(s"Unsupported database platform: $url")+    }++    stmt.close()+  }++  /** Merge the raw transactions into a JDBC target database applying the inserts/updates/deletes+   * according to transactions in the raw CDC data.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcMerged(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    log.info(s"+++ Merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")++    val tableName = clientConfig.jdbcConnectionMerged.jdbcSchema + "." 
+ tableDataFrameWrapperForMicroBatch.tableName+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName++    tableDataFrameWrapperForMicroBatch.dataFrame.cache()++    // Get list of CDA internal use columns to get rid of.+    val dropList = tableDataFrameWrapperForMicroBatch.dataFrame.columns.filter(colName => colName.toLowerCase.startsWith("gwcbi___"))++    // Log total rows to be merged for this fingerprint.+    val totCnt = tableDataFrameWrapperForMicroBatch.dataFrame.count()+    log.info(s"Merged - $tableName total cnt for all ins/upd/del: ${totCnt.toString}")++    // Filter for records to insert and drop unwanted columns.+    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(2, 0))+      .drop(dropList: _*)+      .cache()++    // Log total rows to be inserted for this fingerprint.+    val insCnt = InsertDF.count()+    log.info(s"Merged - $tableName insert cnt after filter: ${insCnt.toString}")++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionMerged.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName+    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Merged, dbProductName)+      log.info(s"Merged - $createTableDDL")+      // Execute the table create DDL+      val stmt = connection.createStatement+      stmt.execute(createTableDDL)+      stmt.close()++      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Merged)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Merged - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, insertStatement, batchSize, dialect, JdbcWriteType.Merged)++    InsertDF.unpersist()++    // Filter for records to update.+    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))+      .cache()++    // Log total rows marked as updates.+    val UpdCnt = UpdateDF.count()+    log.info(s"Merged - $tableName update cnt after filter: ${UpdCnt.toString}")++    // Generate and apply update statements based on the latest transaction for each id.+    if (UpdCnt > 0) {++      // Get the list of columns+      val colNamesArray = UpdateDF.columns.toBuffer+      
// Remove the id and sequence from the list of columns so we can handle them separately.+      // We will be grouping by the id and the sequence will be set as the first item in the list of columns.+      colNamesArray --= Array("id", "gwcbi___seqval_hex")+      val colNamesString = "gwcbi___seqval_hex, " + colNamesArray.mkString(",")++      val latestChangeForEachID = if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Find the latest change for each id based on the gwcbi___seqval_hex.+        // Note: For nested structs, max on struct is computed as+        // max on first struct field, if equal fall back to second fields, and so on.+        // In this case the first struct field is gwcbi___seqval_hex which will be always+        // be unique for each instance of an id in the group.+        UpdateDF+          .selectExpr(Seq("id", s"struct($colNamesString) as otherCols"): _*)+          .groupBy("id").agg(sqlfun.max("otherCols").as("latest"))+          .selectExpr("latest.*", "id")+          .drop(dropList: _*)+          .cache()+      } else {+        // Retain all updates.  Sort so they are applied in the correct order.+        val colVar = colNamesString + ",id"+        UpdateDF+          .selectExpr(colVar.split(","): _*)+          .sort(col("gwcbi___seqval_hex").asc)+          .drop(dropList: _*)+          .cache()+      }+      UpdateDF.unpersist()++      val latestUpdCnt = latestChangeForEachID.count()+      if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Log row count following the reduction to only last update for each id.+        log.info(s"Merged - $tableName update cnt after agg to get latest for each id: ${latestUpdCnt.toString}")+      } else {+        log.info(s"Merged - $tableName all updates will be applied in sequence.")+      }++      // Build the sql Update statement to be used as a prepared statement for the Updates.+      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")+      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")+      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"+      log.info(s"Merged - $updateStatement")++      // Get schema info required for updatePartition call.+      val updateSchema = latestChangeForEachID.schema++      // Prepare and execute one update statement per row in our update dataframe.+      updateDataframe(connection, tableName, latestChangeForEachID, updateSchema, updateStatement, batchSize, dialect, JdbcWriteType.Merged)++      latestChangeForEachID.unpersist()+    }++    // Filter for records to be deleted.+    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.+    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))+      .selectExpr("id")+      .cache()++    // Log number of records to be deleted.+    val delCnt = DeleteDF.count()+    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")++    // Generate and apply delete statements.+    if (delCnt > 0) {+      val deleteSchema = DeleteDF.schema+      // Build the sql Delete statement to be used as a prepared statement for the Updates.+      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"+      log.info(s"Merged - $deleteStatement")++      // Prepare and execute one delete statement per row in our delete dataframe.+      updateDataframe(connection, tableName, DeleteDF, 
deleteSchema, deleteStatement, batchSize, dialect, JdbcWriteType.Merged)++      tableDataFrameWrapperForMicroBatch.dataFrame.unpersist()+      DeleteDF.unpersist()+    }+    log.info(s"+++ Finished merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")+  }++  /** Build and return a table create DDL statement based on the given schema definition.+   *+   * @param dialect+   * @param schema+   * @param tableName+   * @param jdbcWriteType+   * @param dbProductName+   * @return Table create DDL statement for the given table.+   */+  def getTableCreateDDL(dialect: JdbcDialect, schema: StructType, tableName: String, jdbcWriteType: JdbcWriteType.Value, dbProductName: String): String = {+    val allTableColumnsDefinitions = new StringBuilder()++    //TODO Should this be set up as defined list(s)?+    // Define specific columns we want to set as NOT NULL. Everything coming out of CDA parquet files is defined as nullable so+    // we do this to ensure there are columns available to set as PKs and/or AKs.+    var notNullCols = List("id", "gwcbi___operation", "gwcbi___seqval_hex")+    if (jdbcWriteType == JdbcWriteType.Merged) {+      // For merged data, include publicid, retired, and typecode in list of not null columns+      // so they can be included in unique index definitions.+      notNullCols = notNullCols ++ List("publicid", "retired", "typecode")+    }+    // Build the list of columns in alphabetic order.+    schema.fields.sortBy(f => f.name).foreach { field =>+      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true+      val name = dialect.quoteIdentifier(field.name)+      val columnDefinition = buildDDLColumnDefinition(dialect, dbProductName, tableName, name, field.dataType, nullable)+      allTableColumnsDefinitions.append(s"$columnDefinition, ")+    }+    // Remove the trailing comma.+    val colsForCreateDDL = allTableColumnsDefinitions.stripSuffix(", ")+    // Build and return the final create table statement.+    s"CREATE TABLE $tableName ($colsForCreateDDL)"+  }++  /** Build and return a column definition to be used in CREATE and ALTER DDL statements.+   *+   * @param dialect+   * @param dbProductName+   * @param tableName+   * @param fieldName+   * @param fieldDataType+   * @param fieldNullable+   * @return Column definition - COLUMN_NAME TYPE_DECLARATION NULLABLE (i.e. '"ColumnName" VARCHAR(1333) NOT NULL').+   */+  def buildDDLColumnDefinition(dialect: JdbcDialect, dbProductName: String, tableName: String, fieldName: String, fieldDataType: DataType, fieldNullable: Boolean): String = {+    val columnDefinition = new StringBuilder()++    // TODO Consider making gwcbi___seqval_hex a smaller varchar than (1333) since it is part of a clustered index+    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.+    // nvarchar(max) columns can't be indexed.  
Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.+    val stringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(1333)"+      case "PostgreSQL"           => "VARCHAR(1333)"+      case "Oracle"               => "VARCHAR2(1333)"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for string data we need to handle very large text columns that we know of to avoid truncation sql exceptions.+    val largeStringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(max)"+      case "PostgreSQL"           => "VARCHAR"+      case "Oracle"               => "VARCHAR2(32767)" // requires MAX_STRING_SIZE Oracle parameter to be set to EXTENDED.+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for BLOB data we need to handle differently for different platforms.+    val blobDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARBINARY(max)"+      case "PostgreSQL"           => "bytea"+      case "Oracle"               => "BLOB"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val fieldDataTypeDefinition = if (fieldDataType == StringType)+    // TODO Consider making the determination for the need for very large text columns configurable.+    // These are the OOTB columns we have found so far.+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))+        || (tableName.equals("cc_note") && fieldName.equals("body"))+      ) largeStringDataType+      else stringDataType+    else if (fieldDataType == BinaryType) blobDataType+    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition+    val nullable = if (!fieldNullable) "NOT NULL" else ""+    columnDefinition.append(s"$fieldName $fieldDataTypeDefinition $nullable")+    columnDefinition.toString()+  }++  private def updateDataframe(conn: Connection,+                              table: String,+                              df: DataFrame,+                              rddSchema: StructType,+                              updateStmt: String,+                              batchSize: Int,+                              dialect: JdbcDialect,+                              jdbcWriteType: JdbcWriteType.Value+                             ): Unit = {+    var completed = false+    var totalRowCount = 0L+    val dbProductName = conn.getMetaData.getDatabaseProductName+    try {+      val stmt = conn.prepareStatement(updateStmt)+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))+      //For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues+      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)+      val numFields = rddSchema.fields.length++      try {+        var rowCount = 0++        df.collect().foreach { row =>+          var i = 0+          while (i < numFields) {+            if (row.isNullAt(i)) {+              stmt.setNull(i + 1, nullTypes(i))+            } else {+              
setters(i).apply(stmt, row, i)+            }+            i = i + 1+          }
          for (i <- 0 until numFields) {
            if (row.isNullAt(i)) {
              stmt.setNull(i + 1, nullTypes(i))
            } else {
              setters(i).apply(stmt, row, i)
            }
          }
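For completeness, a small self-contained illustration of why the for comprehension over 0 until numFields visits the same indexes as the original while loop; the values array and the setNull/setValue helpers are hypothetical stand-ins for the JDBC statement calls:

object FieldLoopSketch extends App {
  val values: Array[Option[String]] = Array(Some("a"), None, Some("c"))
  val numFields = values.length

  def setNull(pos: Int): Unit = println(s"setNull at parameter $pos")
  def setValue(pos: Int, v: String): Unit = println(s"set '$v' at parameter $pos")

  // Field indexes are 0-based while JDBC parameter positions are 1-based,
  // hence the i + 1 in both the original loop and this one.
  for (i <- 0 until numFields) {
    values(i) match {
      case None    => setNull(i + 1)
      case Some(v) => setValue(i + 1, v)
    }
  }
}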
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {     }   } +  /** Write a table to a Parquet.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  def writeParquet(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {+    val pathToFolderWithParquet = this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)+    if (saveAsSingleFile) {+      tableDataFrameWrapperForMicroBatch.dataFrame+        .coalesce(1)+        .write.mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    } else {+      tableDataFrameWrapperForMicroBatch.dataFrame.write+        .mode(SaveMode.Overwrite)+        .parquet(pathToFolderWithParquet)+    }+  }++  /** Write RAW data to a JDBC target database.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcRaw(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    val tableName = clientConfig.jdbcConnectionRaw.jdbcSchema + "." + tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp++    log.info(s"*** Writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")++    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame+    InsertDF.cache()++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionRaw.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName++    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next()++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Raw, dbProductName)+      // Execute the table create DDL+      val stmt = connection.createStatement+      log.info(s"Raw - $createTableDDL")+      stmt.execute(createTableDDL)+      stmt.close()+      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Raw)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Raw - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, 
insertStatement, batchSize, dialect, JdbcWriteType.Raw)++    log.info(s"*** Finished writing '${tableDataFrameWrapperForMicroBatch.tableName}' raw data data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionRaw.jdbcUrl}")+  }++  /**+   * @param connection    database connection+   * @param url           used to determine db platform+   * @param tableName     name of the table without the schema prefix+   * @param jdbcWriteType indicates Raw or Merged data write type+   */+  private def createIndexes(connection: Connection, url: String, tableName: String, jdbcWriteType: JdbcWriteType.Value): Unit = {+    val stmt = connection.createStatement+    val tableNameNoSchema = tableName.substring(tableName.indexOf(".") + 1)+    if (url.toLowerCase.contains("sqlserver") || url.toLowerCase.contains("postgresql") || url.toLowerCase.contains("oracle")) {++      // Create primary key.+      var ddlPK = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_pk PRIMARY KEY "+      if (jdbcWriteType == JdbcWriteType.Merged) {+        ddlPK = ddlPK + "(\"id\")"+      }+      else {+        ddlPK = ddlPK + "(\"id\", \"gwcbi___seqval_hex\")"+      }+      log.info(s"$jdbcWriteType - $ddlPK")+      stmt.execute(ddlPK)++      // Create alternate keys for Merged data.  Raw data will not have any alternate keys since columns other than+      // the PK can be null (due to records for deletes).+      if (jdbcWriteType == JdbcWriteType.Merged) {+        var ddlAK1 = "ALTER TABLE " + tableName + " ADD CONSTRAINT " + tableNameNoSchema + "_ak1 UNIQUE "+        if (tableNameNoSchema.startsWith("pctl_") || tableNameNoSchema.startsWith("cctl_") || tableNameNoSchema.startsWith("bctl_") || tableNameNoSchema.startsWith("abtl_")) {+          ddlAK1 = ddlAK1 + "(\"typecode\")"+        }+        else {+          ddlAK1 = ddlAK1 + "(\"publicid\")"+        }+        log.info(s"$jdbcWriteType - $ddlAK1")+        stmt.execute(ddlAK1)+      }++    } else {+      log.info(s"Unsupported database.  $url. Indexes were not created.")+      stmt.close()+      throw new SQLException(s"Unsupported database platform: $url")+    }++    stmt.close()+  }++  /** Merge the raw transactions into a JDBC target database applying the inserts/updates/deletes+   * according to transactions in the raw CDC data.+   *+   * @param tableDataFrameWrapperForMicroBatch has the data to be written+   */+  private def writeJdbcMerged(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {++    log.info(s"+++ Merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")++    val tableName = clientConfig.jdbcConnectionMerged.jdbcSchema + "." 
+ tableDataFrameWrapperForMicroBatch.tableName+    val tableNameNoSchema = tableDataFrameWrapperForMicroBatch.tableName++    tableDataFrameWrapperForMicroBatch.dataFrame.cache()++    // Get list of CDA internal use columns to get rid of.+    val dropList = tableDataFrameWrapperForMicroBatch.dataFrame.columns.filter(colName => colName.toLowerCase.startsWith("gwcbi___"))++    // Log total rows to be merged for this fingerprint.+    val totCnt = tableDataFrameWrapperForMicroBatch.dataFrame.count()+    log.info(s"Merged - $tableName total cnt for all ins/upd/del: ${totCnt.toString}")++    // Filter for records to insert and drop unwanted columns.+    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(2, 0))+      .drop(dropList: _*)+      .cache()++    // Log total rows to be inserted for this fingerprint.+    val insCnt = InsertDF.count()+    log.info(s"Merged - $tableName insert cnt after filter: ${insCnt.toString}")++    // Determine if we need to create the table by checking if the table already exists.+    val url = clientConfig.jdbcConnectionMerged.jdbcUrl+    val dbm = connection.getMetaData+    val dbProductName = dbm.getDatabaseProductName+    val tableNameCaseSensitive = dbProductName match {+      case "Microsoft SQL Server" => tableNameNoSchema+      case "PostgreSQL"           => tableNameNoSchema+      case "Oracle"               => tableNameNoSchema.toUpperCase+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val tables = dbm.getTables(connection.getCatalog(), connection.getSchema(), tableNameCaseSensitive, Array("TABLE"))+    val tableExists = tables.next++    // Get some data we will need for later.+    val dialect = JdbcDialects.get(url)+    val insertSchema = InsertDF.schema+    val batchSize = 5000 // TODO consider making this configurable.++    // Create the table if it does not already exist.+    if (!tableExists) {+      // Build create table statement.+      val createTableDDL = getTableCreateDDL(dialect, insertSchema, tableName, JdbcWriteType.Merged, dbProductName)+      log.info(s"Merged - $createTableDDL")+      // Execute the table create DDL+      val stmt = connection.createStatement+      stmt.execute(createTableDDL)+      stmt.close()++      // Create table indexes for the new table.+      createIndexes(connection, url, tableName, JdbcWriteType.Merged)+      connection.commit()+    }++    // Build the insert statement.+    val columns = insertSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")+    val placeholders = insertSchema.fields.map(_ => "?").mkString(",")+    val insertStatement = s"INSERT INTO $tableName ($columns) VALUES ($placeholders)"+    log.info(s"Merged - $insertStatement")++    // Prepare and execute one insert statement per row in our insert dataframe.+    updateDataframe(connection, tableName, InsertDF, insertSchema, insertStatement, batchSize, dialect, JdbcWriteType.Merged)++    InsertDF.unpersist()++    // Filter for records to update.+    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))+      .cache()++    // Log total rows marked as updates.+    val UpdCnt = UpdateDF.count()+    log.info(s"Merged - $tableName update cnt after filter: ${UpdCnt.toString}")++    // Generate and apply update statements based on the latest transaction for each id.+    if (UpdCnt > 0) {++      // Get the list of columns+      val colNamesArray = UpdateDF.columns.toBuffer+      
// Remove the id and sequence from the list of columns so we can handle them separately.+      // We will be grouping by the id and the sequence will be set as the first item in the list of columns.+      colNamesArray --= Array("id", "gwcbi___seqval_hex")+      val colNamesString = "gwcbi___seqval_hex, " + colNamesArray.mkString(",")++      val latestChangeForEachID = if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Find the latest change for each id based on the gwcbi___seqval_hex.+        // Note: For nested structs, max on struct is computed as+        // max on first struct field, if equal fall back to second fields, and so on.+        // In this case the first struct field is gwcbi___seqval_hex which will be always+        // be unique for each instance of an id in the group.+        UpdateDF+          .selectExpr(Seq("id", s"struct($colNamesString) as otherCols"): _*)+          .groupBy("id").agg(sqlfun.max("otherCols").as("latest"))+          .selectExpr("latest.*", "id")+          .drop(dropList: _*)+          .cache()+      } else {+        // Retain all updates.  Sort so they are applied in the correct order.+        val colVar = colNamesString + ",id"+        UpdateDF+          .selectExpr(colVar.split(","): _*)+          .sort(col("gwcbi___seqval_hex").asc)+          .drop(dropList: _*)+          .cache()+      }+      UpdateDF.unpersist()++      val latestUpdCnt = latestChangeForEachID.count()+      if (clientConfig.jdbcConnectionMerged.jdbcApplyLatestUpdatesOnly) {+        // Log row count following the reduction to only last update for each id.+        log.info(s"Merged - $tableName update cnt after agg to get latest for each id: ${latestUpdCnt.toString}")+      } else {+        log.info(s"Merged - $tableName all updates will be applied in sequence.")+      }++      // Build the sql Update statement to be used as a prepared statement for the Updates.+      val colListForSetClause = latestChangeForEachID.columns.filter(_ != "id")+      val colNamesForSetClause = colListForSetClause.map("\"" + _ + "\" = ?").mkString(", ")+      val updateStatement = "UPDATE " + tableName + " SET " + colNamesForSetClause + " WHERE \"id\" = ?"+      log.info(s"Merged - $updateStatement")++      // Get schema info required for updatePartition call.+      val updateSchema = latestChangeForEachID.schema++      // Prepare and execute one update statement per row in our update dataframe.+      updateDataframe(connection, tableName, latestChangeForEachID, updateSchema, updateStatement, batchSize, dialect, JdbcWriteType.Merged)++      latestChangeForEachID.unpersist()+    }++    // Filter for records to be deleted.+    // Deletes should be relatively rare since most data is retired in InsuranceSuite rather than deleted.+    val DeleteDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(1))+      .selectExpr("id")+      .cache()++    // Log number of records to be deleted.+    val delCnt = DeleteDF.count()+    log.info(s"Merged - $tableName delete cnt after filter: ${delCnt.toString}")++    // Generate and apply delete statements.+    if (delCnt > 0) {+      val deleteSchema = DeleteDF.schema+      // Build the sql Delete statement to be used as a prepared statement for the Updates.+      val deleteStatement = "DELETE FROM " + tableName + " WHERE \"id\" = ?"+      log.info(s"Merged - $deleteStatement")++      // Prepare and execute one delete statement per row in our delete dataframe.+      updateDataframe(connection, tableName, DeleteDF, 
deleteSchema, deleteStatement, batchSize, dialect, JdbcWriteType.Merged)++      tableDataFrameWrapperForMicroBatch.dataFrame.unpersist()+      DeleteDF.unpersist()+    }+    log.info(s"+++ Finished merging '${tableDataFrameWrapperForMicroBatch.tableName}' data for fingerprint ${tableDataFrameWrapperForMicroBatch.schemaFingerprint} as JDBC to ${clientConfig.jdbcConnectionMerged.jdbcUrl}")+  }++  /** Build and return a table create DDL statement based on the given schema definition.+   *+   * @param dialect+   * @param schema+   * @param tableName+   * @param jdbcWriteType+   * @param dbProductName+   * @return Table create DDL statement for the given table.+   */+  def getTableCreateDDL(dialect: JdbcDialect, schema: StructType, tableName: String, jdbcWriteType: JdbcWriteType.Value, dbProductName: String): String = {+    val allTableColumnsDefinitions = new StringBuilder()++    //TODO Should this be set up as defined list(s)?+    // Define specific columns we want to set as NOT NULL. Everything coming out of CDA parquet files is defined as nullable so+    // we do this to ensure there are columns available to set as PKs and/or AKs.+    var notNullCols = List("id", "gwcbi___operation", "gwcbi___seqval_hex")+    if (jdbcWriteType == JdbcWriteType.Merged) {+      // For merged data, include publicid, retired, and typecode in list of not null columns+      // so they can be included in unique index definitions.+      notNullCols = notNullCols ++ List("publicid", "retired", "typecode")+    }+    // Build the list of columns in alphabetic order.+    schema.fields.sortBy(f => f.name).foreach { field =>+      val nullable = if (notNullCols.contains(field.name) || !field.nullable) false else true+      val name = dialect.quoteIdentifier(field.name)+      val columnDefinition = buildDDLColumnDefinition(dialect, dbProductName, tableName, name, field.dataType, nullable)+      allTableColumnsDefinitions.append(s"$columnDefinition, ")+    }+    // Remove the trailing comma.+    val colsForCreateDDL = allTableColumnsDefinitions.stripSuffix(", ")+    // Build and return the final create table statement.+    s"CREATE TABLE $tableName ($colsForCreateDDL)"+  }++  /** Build and return a column definition to be used in CREATE and ALTER DDL statements.+   *+   * @param dialect+   * @param dbProductName+   * @param tableName+   * @param fieldName+   * @param fieldDataType+   * @param fieldNullable+   * @return Column definition - COLUMN_NAME TYPE_DECLARATION NULLABLE (i.e. '"ColumnName" VARCHAR(1333) NOT NULL').+   */+  def buildDDLColumnDefinition(dialect: JdbcDialect, dbProductName: String, tableName: String, fieldName: String, fieldDataType: DataType, fieldNullable: Boolean): String = {+    val columnDefinition = new StringBuilder()++    // TODO Consider making gwcbi___seqval_hex a smaller varchar than (1333) since it is part of a clustered index+    // Explicitly set the data type for string data to avoid nvarchar(max) and varchar2 types that are potentially too long or short.+    // nvarchar(max) columns can't be indexed.  
Oracle JDBC converts the string datatype to VARCHAR2(255) which is potentially too short.+    val stringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(1333)"+      case "PostgreSQL"           => "VARCHAR(1333)"+      case "Oracle"               => "VARCHAR2(1333)"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for string data we need to handle very large text columns that we know of to avoid truncation sql exceptions.+    val largeStringDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARCHAR(max)"+      case "PostgreSQL"           => "VARCHAR"+      case "Oracle"               => "VARCHAR2(32767)" // requires MAX_STRING_SIZE Oracle parameter to be set to EXTENDED.+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    // Also for BLOB data we need to handle differently for different platforms.+    val blobDataType = dbProductName match {+      case "Microsoft SQL Server" => "VARBINARY(max)"+      case "PostgreSQL"           => "bytea"+      case "Oracle"               => "BLOB"+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")+    }+    val fieldDataTypeDefinition = if (fieldDataType == StringType)+    // TODO Consider making the determination for the need for very large text columns configurable.+    // These are the OOTB columns we have found so far.+      if ((tableName.equals("cc_outboundrecord") && fieldName.equals("content"))+        || (tableName.equals("cc_contactorigvalue") && fieldName.equals("origvalue"))+        || (tableName.equals("pc_diagratingworksheet") && fieldName.equals("diagnosticcapture"))+        || (tableName.equals("cc_note") && fieldName.equals("body"))+      ) largeStringDataType+      else stringDataType+    else if (fieldDataType == BinaryType) blobDataType+    else getJdbcType(fieldDataType, dialect).databaseTypeDefinition+    val nullable = if (!fieldNullable) "NOT NULL" else ""+    columnDefinition.append(s"$fieldName $fieldDataTypeDefinition $nullable")+    columnDefinition.toString()+  }++  private def updateDataframe(conn: Connection,+                              table: String,+                              df: DataFrame,+                              rddSchema: StructType,+                              updateStmt: String,+                              batchSize: Int,+                              dialect: JdbcDialect,+                              jdbcWriteType: JdbcWriteType.Value+                             ): Unit = {+    var completed = false+    var totalRowCount = 0L+    val dbProductName = conn.getMetaData.getDatabaseProductName+    try {+      val stmt = conn.prepareStatement(updateStmt)+      val setters = rddSchema.fields.map(f => makeSetter(conn, dialect, f.dataType))+      //For Oracle only - map nullTypes to TINYINT for Boolean to work around Oracle JDBC driver issues+      val nullTypes = rddSchema.fields.map(f => if (dbProductName == "Oracle" && f.dataType == BooleanType) JdbcType("BYTE", java.sql.Types.TINYINT).jdbcNullType else getJdbcType(f.dataType, dialect).jdbcNullType)+      val numFields = rddSchema.fields.length++      try {+        var rowCount = 0++        df.collect().foreach { row =>+          var i = 0+          while (i < numFields) {+            if (row.isNullAt(i)) {+              stmt.setNull(i + 1, nullTypes(i))+            } else {+              
setters(i).apply(stmt, row, i)+            }+            i = i + 1+          }+          stmt.addBatch()+          rowCount += 1+          totalRowCount += 1+          if (rowCount % batchSize == 0) {+            stmt.executeBatch()+            log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+            rowCount = 0+          }+        }++        if (rowCount > 0) {+          stmt.executeBatch()+          log.info(s"$jdbcWriteType - executeBatch - ${rowCount.toString} rows - $updateStmt")+        }+      } finally {+        stmt.close()+      }+      completed = true+    } catch {+      case e: SQLException =>+        //log.info(s"Catch exception for $table - $updateStmt")+        val cause = e.getCause+        val nextcause = e.getNextException+        if (nextcause != null && cause != nextcause) {+          // If there is no cause already, set 'next exception' as cause. If cause is null,+          // it *may* be because no cause was set yet+          if (cause == null) {+            try {+              e.initCause(nextcause)+            } catch {+              // Or it may be null because the cause *was* explicitly initialized, to *null*,+              // in which case this fails. There is no other way to detect it.+              // addSuppressed in this case as well.+              case _: IllegalStateException => e.addSuppressed(nextcause)+            }+          } else {+            e.addSuppressed(nextcause)+          }+        }+        throw e+    } finally {+      if (!completed) {+        // The stage must fail.  We got here through an exception path, so+        // let the exception through and tell the user about another problem.+        log.info(s"$jdbcWriteType - Update failed for $table - $updateStmt")+      } else {+        log.info(s"$jdbcWriteType - Total rows updated for $table: $totalRowCount rows - $updateStmt")+      }+    }+  }++  private def getJdbcType(dt: DataType, dialect: JdbcDialect): JdbcType = {+    dialect.getJDBCType(dt).orElse(getCommonJDBCType(dt)).getOrElse(+      throw new IllegalArgumentException(s"Can't get JDBC type for $dt.catalogString"))+  }++  /**+   * Retrieve standard jdbc types.+   *+   * @param dt The datatype (e.g. [[org.apache.spark.sql.types.StringType]])+   * @return The default JdbcType for this DataType+   */+  private def getCommonJDBCType(dt: DataType): Option[JdbcType] = {++    dt match {+      case IntegerType    => Option(JdbcType("INTEGER", java.sql.Types.INTEGER))
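The merge path in the hunk above reduces multiple updates per id to the most recent one by aggregating max over a struct whose first field is gwcbi___seqval_hex, so the comparison is decided by the sequence value and the rest of the row rides along inside the struct. A minimal standalone sketch of that pattern, with hypothetical column names and sample data rather than the client's code:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, max, struct}

object LatestPerIdSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("latest-per-id").getOrCreate()
  import spark.implicits._

  // Hypothetical change rows: (id, seqval, payload); two updates arrive for id 1.
  val updates = Seq((1L, "0a", "first"), (1L, "0b", "second"), (2L, "01", "only"))
    .toDF("id", "seqval", "payload")

  // Struct ordering compares fields left to right, so putting the sequence
  // value first means max(...) picks the row with the highest seqval per id.
  val latestPerId = updates
    .select(col("id"), struct(col("seqval"), col("payload")).as("otherCols"))
    .groupBy("id")
    .agg(max("otherCols").as("latest"))
    .select(col("id"), col("latest.payload").as("payload"))

  latestPerId.show()
  spark.stop()
}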

Substitute Some for Option here, since these values are known to not be null.
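A minimal sketch of the substitution the comment asks for, keeping only the one mapping visible in the hunk (the remaining cases are elided); Some states directly that the literal JdbcType is never null, whereas Option(...) implies a null check that is not needed:

import org.apache.spark.sql.jdbc.JdbcType
import org.apache.spark.sql.types.{DataType, IntegerType}

object CommonJdbcTypeSketch {
  // Some(...) rather than Option(...): the wrapped value is a literal and cannot be null.
  def getCommonJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case IntegerType => Some(JdbcType("INTEGER", java.sql.Types.INTEGER))
    case _           => None // other type mappings elided
  }
}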

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+        df.collect().foreach { row =>
+          var i = 0
+          while (i < numFields) {
+            if (row.isNullAt(i)) {
+              stmt.setNull(i + 1, nullTypes(i))
+            } else {
+              setters(i).apply(stmt, row, i)
+            }
+            i = i + 1
+          }

also remove var i = 0
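A sketch of the same row-binding loop without the while and the mutable index; the setter type is simplified to a plain function here, since the real code uses Spark's internal setter type, and the batching around it (addBatch, executeBatch every batchSize rows) is unchanged:

import java.sql.PreparedStatement
import org.apache.spark.sql.Row

object RowBindingSketch {
  // Bind one row's values to the prepared statement, walking the fields by index.
  def bindRow(stmt: PreparedStatement,
              row: Row,
              setters: IndexedSeq[(PreparedStatement, Row, Int) => Unit],
              nullTypes: IndexedSeq[Int]): Unit = {
    for (i <- setters.indices) {
      if (row.isNullAt(i)) stmt.setNull(i + 1, nullTypes(i))
      else setters(i)(stmt, row, i)
    }
    stmt.addBatch()
  }
}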

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+    // Filter for records to insert and drop unwanted columns.
+    val InsertDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(2, 0))

camel case this

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+    // Filter for records to update.
+    val UpdateDF = tableDataFrameWrapperForMicroBatch.dataFrame.filter(col("gwcbi___operation").isin(4))

camel case this

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+    val tableNameCaseSensitive = dbProductName match {
+      case "Microsoft SQL Server" => tableNameNoSchema
+      case "PostgreSQL"           => tableNameNoSchema
+      case "Oracle"               => tableNameNoSchema.toUpperCase
+      case _                      => throw new SQLException(s"Unsupported database platform: $dbProductName")

      case "Microsoft SQL Server" | "PostgreSQL" => tableNameNoSchema
      case "Oracle"                              => tableNameNoSchema.toUpperCase
      case _                                     => throw new SQLException(s"Unsupported database platform: $dbProductName")
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

+  private def writeJdbcRaw(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch, connection: Connection): Unit = {
+
+    val tableName = clientConfig.jdbcConnectionRaw.jdbcSchema + "." + tableDataFrameWrapperForMicroBatch.tableName // + "_" + tableDataFrameWrapperForMicroBatch.schemaFingerprintTimestamp

Remove comments here.

cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 trait OutputWriter {
   //Instance vars that are passed in during constructor of the concrete classes
   val outputPath: String
   val includeColumnNames: Boolean
-  val saveAsSingleFileCSV: Boolean
+  val saveAsSingleFile: Boolean
   val saveIntoTimestampDirectory: Boolean
+  val clientConfig: ClientConfig
+
+  object JdbcWriteType extends Enumeration {
+    type JdbcWriteType = Value
+
+    val Raw = Value("Raw")
+    val Merged = Value("Merged")
+  }
 
   /** Validate the outputPath, making sure that it exists/is a valid directory.
-    * If there is a problem, throw an exception.
-    *
-    * In the case of local output, makes sure that the outputPath is a directory
-    * that exists and is not a file.
-    *
-    * In the case of S3 output, makes sure that the outputPath is in an existing
-    * S3 bucket and is also not an existing key to a S3 object.
-    *
-    * @throws java.io.IOException If the validation was not successful
-    *
-    */
+   * If there is a problem, throw an exception.
+   *
+   * In the case of local output, makes sure that the outputPath is a directory
+   * that exists and is not a file.
+   *
+   * In the case of S3 output, makes sure that the outputPath is in an existing
+   * S3 bucket and is also not an existing key to a S3 object.
+   *
+   * @throws java.io.IOException If the validation was not successful
+   *
+   */
   def validate(): Unit
 
-  /** Write a table and its schema to either local filesystem or to S3.
-    *
-    * @param tableDataFrameWrapperForMicroBatch has the data to be written
-    */
+  /** Write a table and its schema to either local filesystem or to S3
+   * and also to JDBC as indicated by clientConfig settings.
+   *
+   * @param tableDataFrameWrapperForMicroBatch has the data to be written
+   */
   def write(tableDataFrameWrapperForMicroBatch: DataFrameWrapperForMicroBatch): Unit = {
     val tableName = tableDataFrameWrapperForMicroBatch.tableName
-    log.info(s"Writing '$tableName' DataFrame as CSV to ${this.getPathToFolderWithCSV(tableDataFrameWrapperForMicroBatch)}")
-    this.writeCSV(tableDataFrameWrapperForMicroBatch)
-    this.writeSchema(tableDataFrameWrapperForMicroBatch)
-    log.info(s"Wrote '$tableName' DataFrame as CSV complete, with columns ${tableDataFrameWrapperForMicroBatch.dataFrame.columns.toList}")
+
+    // Process file write.
+    if (clientConfig.outputSettings.saveIntoFile) {
+      if (clientConfig.outputSettings.fileFormat.toLowerCase == "csv") {

I would write this out explicitly as a match statement. It's possible that the validation logic passes something that this code does not handle. Better to fail fast.

if (...) {
  clientConfig.outputSettings.fileFormat.toLowerCase match {
    case "csv" => ...
    case "parquet" => ...
    case other => throw new Exception(s"Unknown output file format $other")
  }
}
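Filled out slightly, the suggested dispatch could look like the sketch below; the two writer callbacks stand in for the writeCSV and writeParquet methods from the hunk so the example stays self-contained, and any format the config validation lets through unexpectedly fails immediately:

object FileFormatDispatchSketch {
  // Route on the configured format and fail fast on anything unrecognised.
  def writeWith(fileFormat: String, writeCsv: () => Unit, writeParquet: () => Unit): Unit =
    fileFormat.toLowerCase match {
      case "csv"     => writeCsv()
      case "parquet" => writeParquet()
      case other     => throw new Exception(s"Unknown output file format $other")
    }
}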
cwilliams-gw

comment created time in a month

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 object ClientConfigReader {
           s"sparkTuning.maxResultSize value '$maxResultSize' must be 0 or match the this format: $validMemoryArgumentRegex"))
     })
   }
+
+  /** validateSavepointsLocation
+   *
+   * @param clientConfig instance to validate, for performance settings
+   */
+  private def validateSavepointsLocation(clientConfig: ClientConfig): Unit = {
+    try {
+      require(clientConfig.savepointsLocation != null, "savepointsLocation section is missing in the config file")
+    } catch {
+      case e: IllegalArgumentException => throw MissingConfigParameterException("Config section is missing from the config file", e)
+    }
+
+    try {
+      require(clientConfig.savepointsLocation.path != null, "savepointsLocation.path is blank")
+    } catch {
+      case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+    }
+
+    try {
+      require(clientConfig.savepointsLocation.path.endsWith("/") != true, "savepointsLocation.path has a trailing slash, remove it")
+    } catch {
+      case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter has an invalid value", e)
+    }
+  }
+
+  /** validateJdbcConnectionRaw
+   *
+   * @param clientConfig instance to validate, for performance settings
+   */
+  private def validateJdbcConnectionRaw(clientConfig: ClientConfig): Unit = {
+    // If saving to JDBC then validate the jdbcConnection section.
+    if (clientConfig.outputSettings.saveIntoJdbcRaw || clientConfig.outputSettings.saveIntoJdbcMerged) {
+      try {
+        require(clientConfig.jdbcConnectionRaw != null, "jdbcConnectionRaw section is missing in the config file")
+      } catch {
+        case e: IllegalArgumentException => throw MissingConfigParameterException("Config section is missing from the config file", e)
+      }
+
+      /** User name and password are no longer require to support windows authentication.

Remove this commented-out code.

cwilliams-gw

comment created time in a month
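The validateSavepointsLocation method in the diff above follows a require-then-rethrow pattern. A minimal, self-contained sketch of that pattern is shown below; the config and exception classes here are simplified stand-ins for illustration, not the client's actual types:

import scala.util.Try

object ConfigValidationSketch {
  // Simplified stand-ins for the client's config and exception classes; names are illustrative only.
  case class SavepointsLocation(path: String)
  case class ClientConfig(savepointsLocation: SavepointsLocation)
  case class InvalidConfigParameterException(message: String, cause: Throwable) extends Exception(message, cause)

  def validateSavepointsLocation(clientConfig: ClientConfig): Unit = {
    try {
      // require throws IllegalArgumentException when the predicate is false.
      require(clientConfig.savepointsLocation != null, "savepointsLocation section is missing in the config file")
      require(clientConfig.savepointsLocation.path != null, "savepointsLocation.path is blank")
      require(!clientConfig.savepointsLocation.path.endsWith("/"), "savepointsLocation.path has a trailing slash, remove it")
    } catch {
      case e: IllegalArgumentException =>
        throw InvalidConfigParameterException("Config parameter is invalid in the config file", e)
    }
  }

  def main(args: Array[String]): Unit = {
    println(Try(validateSavepointsLocation(ClientConfig(SavepointsLocation("/tmp/savepoints")))))   // Success(())
    println(Try(validateSavepointsLocation(ClientConfig(SavepointsLocation("/tmp/savepoints/")))))  // Failure(InvalidConfigParameterException: ...)
  }
}

Rethrowing as a dedicated exception type keeps the original require message attached as the cause while letting callers distinguish configuration errors from other failures.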

PullRequestReviewEvent

Pull request review comment Guidewire/cda-client

Ps dm branch 1

 object ClientConfigReader {
           s"sparkTuning.maxResultSize value '$maxResultSize' must be 0 or match the this format: $validMemoryArgumentRegex"))
     })
   }
+
+  /** validateSavepointsLocation
+   *
+   * @param clientConfig instance to validate, for performance settings
+   */
+  private def validateSavepointsLocation(clientConfig: ClientConfig): Unit = {
+    try {
+      require(clientConfig.savepointsLocation != null, "savepointsLocation section is missing in the config file")
+    } catch {
+      case e: IllegalArgumentException => throw MissingConfigParameterException("Config section is missing from the config file", e)
+    }
+
+    try {
+      require(clientConfig.savepointsLocation.path != null, "savepointsLocation.path is blank")
+    } catch {
+      case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+    }
+
+    try {
+      require(clientConfig.savepointsLocation.path.endsWith("/") != true, "savepointsLocation.path has a trailing slash, remove it")
+    } catch {
+      case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter has an invalid value", e)
+    }
+  }
+
+  /** validateJdbcConnectionRaw
+   *
+   * @param clientConfig instance to validate, for performance settings
+   */
+  private def validateJdbcConnectionRaw(clientConfig: ClientConfig): Unit = {
+    // If saving to JDBC then validate the jdbcConnection section.
+    if (clientConfig.outputSettings.saveIntoJdbcRaw || clientConfig.outputSettings.saveIntoJdbcMerged) {
+      try {
+        require(clientConfig.jdbcConnectionRaw != null, "jdbcConnectionRaw section is missing in the config file")
+      } catch {
+        case e: IllegalArgumentException => throw MissingConfigParameterException("Config section is missing from the config file", e)
+      }
+
+      /** User name and password are no longer require to support windows authentication.
+       * try {
+       * require(clientConfig.jdbcConnectionRaw.jdbcUsername != null, "jdbcConnectionRaw.jdbcUsername is blank")
+       * } catch {
+       * case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+       * }
+       *
+       * try {
+       * require(clientConfig.jdbcConnectionRaw.jdbcPassword != null, "jdbcConnectionRaw.jdbcPassword is blank")
+       * } catch {
+       * case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+       * }
+       */
+      try {
+        require(clientConfig.jdbcConnectionRaw.jdbcUrl != null, "jdbcConnectionRaw.jdbcUrl is blank")
+      } catch {
+        case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+      }
+
+      try {
+        require(clientConfig.jdbcConnectionRaw.jdbcSchema != null, "jdbcConnectionRaw.jdbcSchema is blank")
+      } catch {
+        case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+      }
+
+      try {
+        require(clientConfig.jdbcConnectionRaw.jdbcSaveMode != null, "jdbcConnectionRaw.jdbcSaveMode is blank")
+      } catch {
+        case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is missing, or is left blank in the config file", e)
+      }
+      val validOptions = List("overwrite", "append")
+      try {
+        require(validOptions.contains(clientConfig.jdbcConnectionRaw.jdbcSaveMode.toLowerCase()),
+          "jdbcConnection.jdbcSaveMode is is not valid.  Valid options are 'overwrite' or 'append'.")
+      } catch {
+        case e: IllegalArgumentException => throw InvalidConfigParameterException("Config parameter is invalid in the config file", e)
+      }
+
+    }
+  }
+
+  /** validateJdbcConnectionMerged
+   *
+   * @param clientConfig instance to validate, for performance settings
+   */
+  private def validateJdbcConnectionMerged(clientConfig: ClientConfig): Unit = {
+    // If saving to JDBC then validate the jdbcConnection section.
+    if (clientConfig.outputSettings.saveIntoJdbcMerged || clientConfig.outputSettings.saveIntoJdbcMerged) {
+      try {
+        require(clientConfig.jdbcConnectionMerged != null, "validateJdbcConnectionMerged section is missing in the config file")
+      } catch {
+        case e: IllegalArgumentException => throw MissingConfigParameterException("Config section is missing from the config file", e)
+      }
+
+      /** User name and password are no longer require to support windows authentication.

Remove this commented-out code block.

cwilliams-gw

comment created time in a month
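The jdbcSaveMode check in the diff above compares a lower-cased value against a small whitelist. A standalone sketch of just that check is shown below; the exception type is a simplified stand-in for illustration, not the client's actual class:

import scala.util.Try

object SaveModeValidationSketch {
  // Simplified stand-in for the client's exception type; redefined here so the sketch compiles on its own.
  case class InvalidConfigParameterException(message: String, cause: Throwable) extends Exception(message, cause)

  private val validOptions = List("overwrite", "append")

  def validateJdbcSaveMode(jdbcSaveMode: String): Unit = {
    try {
      require(jdbcSaveMode != null, "jdbcSaveMode is blank")
      // Lower-casing first makes the whitelist comparison case-insensitive.
      require(validOptions.contains(jdbcSaveMode.toLowerCase()),
        "jdbcSaveMode is not valid. Valid options are 'overwrite' or 'append'.")
    } catch {
      case e: IllegalArgumentException =>
        throw InvalidConfigParameterException("Config parameter is invalid in the config file", e)
    }
  }

  def main(args: Array[String]): Unit = {
    println(Try(validateJdbcSaveMode("Append")))  // Success(()); the comparison is case-insensitive
    println(Try(validateJdbcSaveMode("merge")))   // Failure(InvalidConfigParameterException: ...)
  }
}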

PullRequestReviewEvent