Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HMAC support #20

Merged
merged 5 commits into from
Nov 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ Embulk filter plugin to convert an input to a hash value.

- **columns**: Columns to hash (array, required)
- **name**: Name of input column (string, required)
- **algorithm**: A hash algorithm. [See also](#hash_algorithm) (string, default:`"SHA-256"`)
- **new_name**: New column name if you want to rename (string, default: `null`)
- **algorithm**: Hash algorithm. [See also](#hash_algorithm) (string, default:`"SHA-256"`)
- **secret_key**: Secret key for HMAC hashing (string, required when an HMAC algorithm is specified)
- **new_name**: New column name if you want to rename the column (string, default: `null`)

## Example

Expand All @@ -23,22 +24,22 @@ filters:
columns:
- { name: username }
- { name: email, algorithm: SHA-512, new_name: hashed_email }
- { name: phone_number, algorithm: HmacSHA256, secret_key: passw0rd }
```

## Hash Algorithm
<a name="hash_algorithm"></a>

This plugin uses [MessageDigest](https://docs.oracle.com/javase/7/docs/api/java/security/MessageDigest.html) for hashing.
Every implementation of the Java platform supports the following MessageDigest algorithms:
- MD5
- SHA-1
- SHA-256

You can choose either a [MessageDigest](https://docs.oracle.com/javase/8/docs/api/java/security/MessageDigest.html) algorithm or an [HMAC](https://docs.oracle.com/javase/8/docs/api/javax/crypto/Mac.html) algorithm.
If you want to know all algorithms that your platform supports, run the following snippet.

```java
for (String algorithm : java.security.Security.getAlgorithms("MessageDigest")) {
System.out.println(algorithm);
}
for (String algorithm : java.security.Security.getAlgorithms("Mac")) {
System.out.println(algorithm);
}
```

## Build
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version=0.4.1-SNAPSHOT
version=0.5.0-SNAPSHOT
73 changes: 63 additions & 10 deletions src/main/kotlin/org/embulk/filter/hash/HashFilterPlugin.kt
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
package org.embulk.filter.hash

import com.google.common.base.Optional
import org.embulk.config.Config
import org.embulk.config.ConfigDefault
import org.embulk.config.ConfigSource
import org.embulk.config.Task
import org.embulk.config.TaskSource
import org.embulk.config.*
import org.embulk.spi.Column
import org.embulk.spi.DataException
import org.embulk.spi.Exec
Expand All @@ -17,6 +13,9 @@ import org.embulk.spi.PageReader
import org.embulk.spi.Schema
import org.embulk.spi.type.Types
import java.security.MessageDigest
import java.util.Locale
import javax.crypto.Mac
import javax.crypto.spec.SecretKeySpec

class HashFilterPlugin : FilterPlugin {
interface PluginTask : Task {
Expand All @@ -32,6 +31,10 @@ class HashFilterPlugin : FilterPlugin {
@get:ConfigDefault("\"SHA-256\"")
val algorithm: Optional<String>

@get:Config("secret_key")
@get:ConfigDefault("null")
val secretKey: Optional<String>

@get:Config("new_name")
@get:ConfigDefault("null")
val newName: Optional<String>
Expand All @@ -45,6 +48,8 @@ class HashFilterPlugin : FilterPlugin {
inputSchema.columns.forEach { column ->
val hashColumn = hashColumnMap[column.name]
if (hashColumn != null) {
// Check algorithm is valid
getAlgorithmType(hashColumn.algorithm.get()).validate(hashColumn)
builder.add(hashColumn.newName.or(column.name), Types.STRING)
} else {
builder.add(column.name, column.type)
Expand Down Expand Up @@ -114,18 +119,16 @@ class HashFilterPlugin : FilterPlugin {
hashColumnMap[inputColumn.name]?.let { hashColumn ->
// Write hashed value if it's hash column.
val outputColumn = outputColumnMap[hashColumn.newName.or(inputColumn.name)]
val hashedValue = generateHash(inputValue.toString(), hashColumn.algorithm.get())
val hashedValue = generateHash(inputValue.toString(), hashColumn)
builder.setString(outputColumn, hashedValue)
} ?: run {
// Write the original data
setter(inputColumn, inputValue)
}
}

private fun generateHash(value: String, algorithm: String): String {
val md = MessageDigest.getInstance(algorithm)
md.update(value.toByteArray())
return md.digest().joinToString("") { "%02x".format(it) }
/**
 * Hashes a single column value according to that column's configuration.
 *
 * @param value string form of the input value
 * @param config hash settings of the column being processed
 * @return the hash produced by the algorithm type resolved from `config.algorithm`
 */
private fun generateHash(value: String, config: HashColumn): String {
    val algorithmType = getAlgorithmType(config.algorithm.get())
    return algorithmType.generateHash(value, config)
}

override fun finish() {
Expand All @@ -145,4 +148,54 @@ class HashFilterPlugin : FilterPlugin {
/**
 * Indexes the given columns by their name.
 *
 * @param columns columns to index; a null list fails with a NullPointerException
 * @return map from column name to the column itself
 */
private fun convertColumnListToMap(columns: List<Column>?): Map<String, Column> {
    val presentColumns = columns!!
    return presentColumns.associateBy { it.name }
}

/**
 * Resolves a configured algorithm name to the [AlgorithmType] that handles it.
 *
 * The name is upper-cased with [Locale.ENGLISH] and looked up first in
 * `MD_ALGORITHMS`, then in `MAC_ALGORITHMS`.
 *
 * @param algorithm algorithm name as given in the column configuration
 * @return [AlgorithmType.MESSAGE_DIGEST] or [AlgorithmType.MAC], in that order of precedence
 * @throws ConfigException if the name is found in neither set
 */
private fun getAlgorithmType(algorithm: String): AlgorithmType {
    val normalizedName = algorithm.toUpperCase(Locale.ENGLISH)
    if (normalizedName in MD_ALGORITHMS) {
        return AlgorithmType.MESSAGE_DIGEST
    }
    if (normalizedName in MAC_ALGORITHMS) {
        return AlgorithmType.MAC
    }
    // The message reports the name exactly as the user wrote it, not the normalised form.
    throw ConfigException("No such algorithm: $algorithm")
}

/**
 * Strategy for turning a column value into a lower-case hex hash string.
 *
 * Each entry knows how to check a [HashColumn] configuration for the options it
 * needs ([validate]) and how to hash one value ([generateHash]).
 */
enum class AlgorithmType {
    /** Unkeyed digests computed with [MessageDigest]. */
    MESSAGE_DIGEST {
        // A message digest needs nothing beyond the algorithm name, so there is nothing to check.
        override fun validate(config: HashColumn) {}

        override fun generateHash(value: String, config: HashColumn): String {
            val algorithm = config.algorithm.get()
            return MessageDigest.getInstance(algorithm).run {
                update(value.toByteArray())
                // Two hex digits per digest byte.
                digest().joinToString("") { "%02x".format(it) }
            }
        }
    },

    /** Keyed hashes computed with [Mac]; the column must supply a non-empty `secret_key`. */
    MAC {
        override fun validate(config: HashColumn) {
            if (!config.secretKey.isPresent) {
                throw ConfigException("Secret key must not be null.")
            }
            // SecretKeySpec rejects a zero-length key with IllegalArgumentException, which
            // would otherwise surface only when the first value is hashed. Report it as a
            // configuration error up front instead.
            if (config.secretKey.get().isEmpty()) {
                throw ConfigException("Secret key must not be empty.")
            }
        }

        override fun generateHash(value: String, config: HashColumn): String {
            val secretKey = config.secretKey.get()
            val algorithm = config.algorithm.get()
            return Mac.getInstance(algorithm).run {
                init(SecretKeySpec(secretKey.toByteArray(), algorithm))
                // Two hex digits per MAC byte.
                doFinal(value.toByteArray()).joinToString("") { "%02x".format(it) }
            }
        }
    };

    /**
     * Checks that [config] carries every option this algorithm type needs.
     *
     * @throws ConfigException if a required option is missing or unusable
     */
    abstract fun validate(config: HashColumn)

    /**
     * Hashes [value] with the algorithm named in [config].
     *
     * @return the hash as a lower-case hex string
     */
    abstract fun generateHash(value: String, config: HashColumn): String
}

companion object {
// Algorithm names that java.security.Security reports for the "MessageDigest" service;
// an empty set is used if the lookup yields null.
// NOTE(review): getAlgorithmType upper-cases the configured name before checking these
// sets, so the names here are presumably upper case — confirm against the JDK docs.
val MD_ALGORITHMS = java.security.Security.getAlgorithms("MessageDigest") ?: emptySet<String>()
// Same lookup for the "Mac" service, with the same empty-set fallback.
val MAC_ALGORITHMS = java.security.Security.getAlgorithms("Mac") ?: emptySet<String>()
}
}
72 changes: 69 additions & 3 deletions src/test/kotlin/org/embulk/filter/hash/TestHashFilterPlugin.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.embulk.filter.hash

import org.embulk.config.ConfigException
import org.embulk.exec.PartialExecutionException
import org.embulk.test.EmbulkPluginTest
import org.junit.Test

Expand All @@ -9,14 +11,19 @@ import org.embulk.test.TestOutputPlugin.Matcher.assertSchema
import org.embulk.test.record
import org.embulk.test.registerPlugins
import org.embulk.test.set
import org.hamcrest.Matchers.`is`
import org.hamcrest.Matchers.instanceOf
import org.junit.Assert.assertThat
import org.junit.Assert.fail
import org.junit.Before

class TestHashFilterPlugin : EmbulkPluginTest() {
/** Registers [HashFilterPlugin] with the test builder before each test. */
@Before
fun setup() {
    builder.registerPlugins(HashFilterPlugin::class)
}

@Test fun specifiedColumnIsHashedAndRenamed() {
@Test
fun specifiedColumnIsHashedAndRenamed() {
val config = config().set(
"type" to "hash",
"columns" to listOf(config().set(
Expand All @@ -37,7 +44,8 @@ class TestHashFilterPlugin : EmbulkPluginTest() {
)
}

@Test fun allColumnTypesAreHashed() {
@Test
fun allColumnTypesAreHashed() {
val config = config().set(
"type" to "hash",
"columns" to listOf(
Expand Down Expand Up @@ -71,7 +79,65 @@ class TestHashFilterPlugin : EmbulkPluginTest() {
)
}

@Test fun columnIsNull() {
/** A column configured with HmacSHA256 and a secret key is hashed and renamed. */
@Test
fun specifiedColumnIsHashedByMac() {
    val hmacColumn = config().set(
        "name" to "age",
        "algorithm" to "HmacSHA256",
        "secret_key" to "passw0rd",
        "new_name" to "hashed_age"
    )
    val filterConfig = config().set(
        "type" to "hash",
        "columns" to listOf(hmacColumn)
    )

    runFilter(filterConfig, inConfigPath = "yaml/input_basic.yml")

    assertSchema(
        "username" to STRING,
        "hashed_age" to STRING
    )

    assertRecords(
        record("user1", "5f9959eac71ad30782ebf4d3c98d12a4c33eadee156a6c5d3881204030811989")
    )
}

/** An unknown algorithm name surfaces as a ConfigException wrapped in PartialExecutionException. */
@Test
fun exceptionThrownWithInvalidAlgorithm() {
    try {
        val unknownAlgorithmColumn = config().set(
            "name" to "age",
            "algorithm" to "Foo"
        )
        val filterConfig = config().set(
            "type" to "hash",
            "columns" to listOf(unknownAlgorithmColumn)
        )
        runFilter(filterConfig, inConfigPath = "yaml/input_basic.yml")
        // Reaching this line means the filter accepted the bogus algorithm.
        fail("No exception")
    } catch (e: PartialExecutionException) {
        val rootCause = e.cause
        assertThat(rootCause, instanceOf(ConfigException::class.java))
        assertThat(rootCause?.message, `is`("No such algorithm: Foo"))
    }
}

/** A MAC algorithm without `secret_key` surfaces as a ConfigException wrapped in PartialExecutionException. */
@Test
fun exceptionThrownWithMacAndNoSecretKey() {
    try {
        val keylessHmacColumn = config().set(
            "name" to "age",
            "algorithm" to "HmacSHA256"
        )
        val filterConfig = config().set(
            "type" to "hash",
            "columns" to listOf(keylessHmacColumn)
        )
        runFilter(filterConfig, inConfigPath = "yaml/input_basic.yml")
        // Reaching this line means the missing key was not detected.
        fail("No exception")
    } catch (e: PartialExecutionException) {
        val rootCause = e.cause
        assertThat(rootCause, instanceOf(ConfigException::class.java))
        assertThat(rootCause?.message, `is`("Secret key must not be null."))
    }
}

@Test
fun columnIsNull() {
val config = config().set(
"type" to "hash",
"columns" to listOf(
Expand Down