from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
# Spark application name. Written as a single literal: the earlier
# backslash-continued string dropped the space between "a" and "column".
APP_NAME = "Parse a column of json strings"

# Sample rows: [id, JSON-like text]. The payloads use single-quoted keys and
# values; NOTE(review): this relies on Spark's JSON reader accepting single
# quotes (the documented `allowSingleQuotes` option) -- confirm for your
# Spark version.
SAMPLE_ROWS = [
    ["1", "{'color': 'red', 'value': '#f00'}"],
    ["2", "{'color': 'green', 'value': '#0f0'}"],
    ["3", "{'color': 'blue', 'value': '#00f'}"],
    ["4", "{'color': 'cyan', 'value': '#0ff'}"],
    ["5", "{'color': 'magenta', 'value': '#f0f'}"],
    ["6", "{'color': 'yellow', 'value': '#ff0'}"],
    ["7", "{'color': 'black', 'value': '#000'}"],
]


def _show(df):
    """Print every row of *df* without truncation, followed by its schema."""
    df.show(truncate=False)
    df.printSchema()


def main():
    """Demonstrate round-tripping a column of JSON strings in PySpark.

    Steps, each followed by a dump of the rows and the schema:

    1. Build a two-column DataFrame (``id``, ``colors``) from ``SAMPLE_ROWS``.
    2. Parse ``colors`` with ``from_json`` into a string-to-string map.
    3. Serialize the map back to a JSON string with ``to_json``.
    4. Extract the ``color`` and ``value`` fields into their own columns
       with ``json_tuple``.

    The Spark session is stopped on exit, including when a step raises.
    """
    spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
    try:
        df = spark.createDataFrame(SAMPLE_ROWS).toDF('id', 'colors')
        _show(df)

        # JSON text -> map<string, string>
        string_map = T.MapType(T.StringType(), T.StringType())
        df = df.withColumn("colors", F.from_json(df.colors, string_map))
        _show(df)

        # map<string, string> -> JSON text
        df = df.withColumn("colors", F.to_json(df.colors))
        _show(df)

        # json_tuple yields generated column names, so rename them explicitly.
        df = df.select(
            'id',
            F.json_tuple(F.col("colors"), "color", "value"),
        ).toDF('id', 'color', 'value')
        _show(df)
    finally:
        # Release the session's resources even if one of the steps failed.
        spark.stop()


if __name__ == "__main__":
    main()