Я пытаюсь получить данные из файла json.
df=spark.read.json('/home/data/activities.json',multiLine=True)
Содержимое выглядит следующим образом (я включил только 1 строку с данными, их 94):
{"meta.count":"94",
"data":[
{"id":"f67de4f6-d23e-49a7-b9dd-63d68df533a3",
"name.fi":"Purjehdus 1700-luvun tyyliin tykkisluuppi Dianalla","name.en":"Cannon sloop Diana\u00B4s sailings","name.sv":"Seglingar med kanonslupen Diana",
"name.zh":null,
"source_type.id":3,
"source_type.name":"MyHelsinki",
"info_url":"https:\/\/www.suomenlinnatours.com\/en\/cannon-sloop-diana?johku_product=7",
"modified_at":"2019-12-28T15:10:33.145Z",
"location.lat":60.145111083984375,
"location.lon":24.987560272216797,
"location.address.street_address":"Suomenlinna Tykist\u00F6lahti Pier",
"location.address.postal_code":null,
"location.address.locality":"Helsinki",
"description.intro":null,
"description.body":"<p>Luvassa on amiraali Chapamin tahdittama mielenkiintoinen laivamatka valistusajan Viaporiin... /p>\r\n",
"description.images":[
{"url":"https:\/\/edit.myhelsinki.fi\/sites\/default\/files\/2017-06\/Tykkisluuppi_Diana.jpg",
"copyright_holder":"",
"license_type.id":1,
"license_type.name":"All rights reserved."},
{"url":"https:\/\/edit.myhelsinki.fi\/sites\/default\/files\/2017-06\/Tykkisluuppi_Diana_2.jpg",
"copyright_holder":"",
"license_type.id":1,
"license_type.name":"All rights reserved."},
{"url":"https:\/\/edit.myhelsinki.fi\/sites\/default\/files\/2017-06\/Tykkisluuppi_Diana_3.jpg",
"copyright_holder":"",
"license_type.id":1,"license_type.name":"All rights reserved."}],
"tags":[
{"id":"myhelsinki:45","name":"sea"},
{"id":"myhelsinki:836","name":"suomenlinna"},
{"id":"myhelsinki:793","name":"history"}],
"where_when_duration.where_and_when":"Suomenlinna, kes\u00E4kuusta elokuuhun",
"where_when_duration.duration":"N. 1h 45min"}],
"tags":[
{"id":"myhelsinki:453","name":"nature"},
{"id":"myhelsinki:747","name":"canoeing"},
{"id":"myhelsinki:342","name":"guidance"},
{"id":"myhelsinki:399","name":"outdoor recreation"}],
"where_when_duration.where_and_when":"Toukokuussa joka sunnuntai, kes\u00E4-elokuussa joka keskiviikko ja sunnuntai, syyskuussa joka sunnuntai",
"where_when_duration.duration":"4,5 tuntia sis\u00E4lt\u00E4en kuljetuksen"}],
"tags.myhelsinki:10":"sauna",
"tags.myhelsinki:1016":"heavy rock",
"tags.myhelsinki:1749":"national parks",
"tags.myhelsinki:1822":"schools (educational institutions)",
"tags.myhelsinki:2":"food"
}
Схема выглядит следующим образом:
root
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- description.body: string (nullable = true)
| | |-- description.images: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- copyright_holder: string (nullable = true)
| | | | |-- license_type.id: long (nullable = true)
| | | | |-- license_type.name: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | |-- description.intro: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- info_url: string (nullable = true)
| | |-- location.address.locality: string (nullable = true)
| | |-- location.address.postal_code: string (nullable = true)
| | |-- location.address.street_address: string (nullable = true)
| | |-- location.lat: double (nullable = true)
| | |-- location.lon: double (nullable = true)
| | |-- modified_at: string (nullable = true)
| | |-- name.en: string (nullable = true)
| | |-- name.fi: string (nullable = true)
| | |-- name.sv: string (nullable = true)
| | |-- name.zh: string (nullable = true)
| | |-- source_type.id: long (nullable = true)
| | |-- source_type.name: string (nullable = true)
| | |-- tags: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: string (nullable = true)
| | | | |-- name: string (nullable = true)
| | |-- where_when_duration.duration: string (nullable = true)
| | |-- where_when_duration.where_and_when: string (nullable = true)
|-- meta.count: string (nullable = true)
|-- tags.myhelsinki:10: string (nullable = true)
|-- tags.myhelsinki:1016: string (nullable = true)
|-- tags.myhelsinki:1749: string (nullable = true)
|-- tags.myhelsinki:1822: string (nullable = true)
|-- tags.myhelsinki:2: string (nullable = true)
Меня интересует массив "data", включая вложенный в него массив "tags". Поля "meta.count" и "tags.myhelsinki:..." я хотела бы пропустить.
Я пробовал это:
df.withColumn("expl_data", explode_outer(col("tags"))).select("expl_data.data.name.en").show(10)
и я получаю сообщение об ошибке:
AnalysisException: "cannot resolve '`tags`' given input columns: [tags.myhelsinki:10, tags.myhelsinki:453, tags.myhelsinki:226, tags.myhelsinki:1016, tags.myhelsinki:342, tags.myhelsinki:531, tags.myhelsinki:364, tags.myhelsinki:836, tags.myhelsinki:346,...
Та же ошибка возникает, когда я пытаюсь применить explode к массивам «tags.name» или «description.images». Может ли кто-нибудь помочь? Моя цель — получить набор выбранных полей из этой структуры (теги очень важны).
Спасибо заранее! Алисия