[SPARK-18958][SPARKR] R API toJSON on DataFrame

## What changes were proposed in this pull request? It would make it easier to integrate with other components expecting a row-based JSON format. This replaces the non-public toJSON RDD API. ## How was this patch tested? manual, unit tests Author: Felix Cheung <[email protected]> Closes #16368 from felixcheung/rJSON.
apache · 17579bda3c114022a0b3889aa4c9188307af75e9 · felixcheung committed with Felix Cheung Dec 23, 2016 · Dec 23, 2016 · 17579bd
1 parent f252cb5
commit 17579bda3c114022a0b3889aa4c9188307af75e9
Unified Split

Showing with 26 additions and 18 deletions.

+1 −0 R/pkg/NAMESPACE

+18 −12 R/pkg/R/DataFrame.R

+7 −6 R/pkg/inst/tests/testthat/test_sparkSQL.R
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -133,6 +133,7 @@ exportMethods("arrange",
               "summarize",
               "summary",
               "take",
+              "toJSON",
               "transform",
               "union",
               "unionAll",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -737,26 +737,32 @@ setMethod("repartition",
 
 #' toJSON
 #'
-#' Convert the rows of a SparkDataFrame into JSON objects and return an RDD where
-#' each element contains a JSON string.
+#' Converts a SparkDataFrame into a SparkDataFrame of JSON strings.
 #'
-#' @param x A SparkDataFrame
-#' @return A StringRRDD of JSON objects
+#' Each row is turned into a JSON document with columns as different fields.
+#' The returned SparkDataFrame has a single character column with the name \code{value}.
+#'
+#' @param x a SparkDataFrame
+#' @return a SparkDataFrame
+#' @family SparkDataFrame functions
+#' @rdname toJSON
+#' @name toJSON
 #' @aliases toJSON,SparkDataFrame-method
-#' @noRd
+#' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
-#' path <- "path/to/file.json"
-#' df <- read.json(path)
-#' newRDD <- toJSON(df)
+#' path <- "path/to/file.parquet"
+#' df <- read.parquet(path)
+#' df_json <- toJSON(df)
 #'}
+#' @note toJSON since 2.2.0
 setMethod("toJSON",
           signature(x = "SparkDataFrame"),
           function(x) {
-            rdd <- callJMethod(x@sdf, "toJSON")
-            jrdd <- callJMethod(rdd, "toJavaRDD")
-            RDD(jrdd, serializedMode = "string")
+            jsonDS <- callJMethod(x@sdf, "toJSON")
+            df <- callJMethod(jsonDS, "toDF")
+            dataFrame(df)
           })
 
 #' Save the contents of SparkDataFrame as a JSON file
@@ -936,7 +942,7 @@ setMethod("unique",
 
 #' Sample
 #'
-#' Return a sampled subset of this SparkDataFrame using a random seed. 
+#' Return a sampled subset of this SparkDataFrame using a random seed.
 #' Note: this is not guaranteed to provide exactly the fraction specified
 #' of the total count of of the given SparkDataFrame.
 #'
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1689,12 +1689,13 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   unlink(jsonPath3)
 })
 
-test_that("toJSON() returns an RDD of the correct values", {
-  df <- read.json(jsonPath)
-  testRDD <- toJSON(df)
-  expect_is(testRDD, "RDD")
-  expect_equal(getSerializedMode(testRDD), "string")
-  expect_equal(collectRDD(testRDD)[[1]], mockLines[1])
+test_that("toJSON() on DataFrame", {
+  df <- as.DataFrame(cars)
+  df_json <- toJSON(df)
+  expect_is(df_json, "SparkDataFrame")
+  expect_equal(colnames(df_json), c("value"))
+  expect_equal(head(df_json, 1),
+              data.frame(value = "{\"speed\":4.0,\"dist\":2.0}", stringsAsFactors = FALSE))
 })
 
 test_that("showDF()", {