Example - Splitting nested events into multiple events

Many modern databases allow storing nested or JSON documents which you may want to normalize/split into different events before loading into your data warehouse.

A prominent example of such databases is MongoDB. In this article, we will see how to split nested documents being ingested from a MongoDB collection into multiple events.

Let’s take the following input event ingested from a MongoDB collection named restaurants as an example:

{
    "_id": "5a9909384309cf90b5739342",
    "name": "Mangal Kebab Turkish Restaurant",
    "restaurant_id": "41009112",
    "borough": "Queens",
    "cuisine": "Turkish",
    "address": {
        "building": "4620",
        "coord": {
            "0": -73.9180155,
            "1": 40.7427742
        },
        "street": "Queens Boulevard",
        "zipcode": "11104"
    },
    "grades": {
        "0": {
            "date": 1414540800000,
            "grade": "A",
            "score": 12
        },
        "1": {
            "date": 1397692800000,
            "grade": "A",
            "score": 10
        },
        "2": {
            "date": 1381276800000,
            "grade": "A",
            "score": 12
        }
    }
}

The following Transformations code snippet will split the above event into three events restaurants, restaurant_address, and retaurant_grades:

def transform(event):
    events = [event]
    
    # Get event name from the event #
    eventName = event.getEventName()

    # Get properties from the event #
    properties = event.getProperties()

    # Split events
    
    # Create address
    address = properties.get('address')
    address['ref_id'] = properties.get('restaurant_id')
    events.append(Event('restaurant_address', address))
    del properties['address']
    
    #Create grades
    grades_list = properties.get('grades')
    for i in grades_list:
        grades = grades_list.get(i)
        grades['ref_id'] = properties.get('restaurant_id')
        grades['index'] = i
        events.append(Event('restaurant_grades', grades))
    del properties['grades']

    return events

The output from the above snippet will be:

[
	{
		"event_name": "demo.restaurants",
	    	"properties": {
		    "restaurant_id": "41009112",
			"name": "Mangal Kebab Turkish Restaurant",
			"cuisine": "Turkish",
			"_id": "5a9909384309cf90b5739342",
			"borough": "Queens"
		}
	},
	{
		"event_name": "restaurant_address",
		"properties": {
			"zipcode": "11104",
			"ref_id": "41009112",
			"coord": {
				"0": -73.9180155,
				"1": 40.7427742
			},
			"street": "Queens Boulevard",
			"building": "4620"
		}
	},
	{
		"event_name": "restaurant_grades",
		"properties": {
			"date": 1414540800000,
			"ref_id": "41009112",
			"score": 12,
			"grade": "A",
			"index": "0"
		}
	},
	{
		"event_name": "restaurant_grades",
		"properties": {
			"date": 1397692800000,
			"ref_id": "41009112",
			"score": 10,
			"grade": "A",
			"index": "1"
		}
	},
	{
		"event_name": "restaurant_grades",
		"properties": {
			"date": 1381276800000,
			"ref_id": "41009112",
			"score": 12,
			"grade": "A",
			"index": "2"
		}
	}
]

As you can see above, one restaurant_address event was created from the parent event and three restaurant_grades events were created as the parent event had an array of three such nested objects.