{"id":29384,"date":"2020-02-20T23:43:47","date_gmt":"2020-02-20T23:43:47","guid":{"rendered":"http:\/\/streamhub.co.uk\/?p=29384"},"modified":"2021-06-12T09:21:07","modified_gmt":"2021-06-12T09:21:07","slug":"how-to-design-a-modern-data-analytics-platform","status":"publish","type":"post","link":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/","title":{"rendered":"How to design a modern data analytics platform in a rapidly changing world"},"content":{"rendered":"<span class=\"span-reading-time rt-reading-time\" style=\"display: block;\"><span class=\"rt-label rt-prefix\">Reading Time: <\/span> <span class=\"rt-time\"> 11<\/span> <span class=\"rt-label rt-postfix\">minutes<\/span><\/span><p>[et_pb_section fb_built=&#8221;1&#8243; _builder_version=&#8221;4.0.6&#8243; custom_padding=&#8221;1px||0px|||&#8221; da_is_popup=&#8221;off&#8221; da_exit_intent=&#8221;off&#8221; da_has_close=&#8221;on&#8221; da_alt_close=&#8221;off&#8221; da_dark_close=&#8221;off&#8221; da_not_modal=&#8221;on&#8221; da_is_singular=&#8221;off&#8221; da_with_loader=&#8221;off&#8221; da_has_shadow=&#8221;on&#8221; da_disable_devices=&#8221;off|off|off&#8221;][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;13px|auto||auto||&#8221; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_blurb title=&#8221;@ET-DC@eyJkeW5hbWljIjp0cnVlLCJjb250ZW50IjoicG9zdF9kYXRlIiwic2V0dGluZ3MiOnsiYmVmb3JlIjoiIiwiYWZ0ZXIiOiIiLCJkYXRlX2Zvcm1hdCI6ImRlZmF1bHQiLCJjdXN0b21fZGF0ZV9mb3JtYXQiOiIifX0=@&#8221; use_icon=&#8221;on&#8221; font_icon=&#8221;%%128%%&#8221; icon_color=&#8221;#00ac69&#8243; icon_placement=&#8221;left&#8221; content_max_width=&#8221;1100px&#8221; use_icon_font_size=&#8221;on&#8221; icon_font_size=&#8221;16px&#8221; _builder_version=&#8221;4.0.6&#8243; _dynamic_attributes=&#8221;title&#8221; header_font=&#8221;Work Sans|600|||||||&#8221; header_text_align=&#8221;left&#8221; 
header_font_size=&#8221;14px&#8221; text_orientation=&#8221;center&#8221; custom_margin=&#8221;-7px||0px|||&#8221; animation=&#8221;off&#8221; hover_enabled=&#8221;1&#8243; locked=&#8221;off&#8221; icon_color__hover=&#8221;#00ac69&#8243; box_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.3)&#8221; box_shadow_color__hover=&#8221;rgba(0,0,0,0.3)&#8221; box_shadow_style__hover=&#8221;none&#8221; box_shadow_style__hover_enabled=&#8221;none&#8221; use_background_color_gradient__hover=&#8221;off&#8221; use_background_color_gradient__hover_enabled=&#8221;off&#8221; background_color_gradient_start__hover=&#8221;#2b87da&#8221; background_color_gradient_start__hover_enabled=&#8221;#2b87da&#8221; background_color_gradient_end__hover=&#8221;#29c4a9&#8243; background_color_gradient_end__hover_enabled=&#8221;#29c4a9&#8243; background_color_gradient_type__hover=&#8221;linear&#8221; background_color_gradient_type__hover_enabled=&#8221;linear&#8221; background_color_gradient_direction__hover=&#8221;180deg&#8221; background_color_gradient_direction__hover_enabled=&#8221;180deg&#8221; background_color_gradient_direction_radial__hover=&#8221;center&#8221; background_color_gradient_direction_radial__hover_enabled=&#8221;center&#8221; background_color_gradient_start_position__hover=&#8221;0%&#8221; background_color_gradient_start_position__hover_enabled=&#8221;0%&#8221; background_color_gradient_end_position__hover=&#8221;100%&#8221; background_color_gradient_end_position__hover_enabled=&#8221;100%&#8221; background_color_gradient_overlays_image__hover=&#8221;off&#8221; background_color_gradient_overlays_image__hover_enabled=&#8221;off&#8221; parallax__hover=&#8221;off&#8221; parallax__hover_enabled=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; parallax_method__hover_enabled=&#8221;on&#8221; background_size__hover=&#8221;cover&#8221; background_size__hover_enabled=&#8221;cover&#8221; background_position__hover=&#8221;center&#8221; 
background_position__hover_enabled=&#8221;center&#8221; background_repeat__hover=&#8221;no-repeat&#8221; background_repeat__hover_enabled=&#8221;no-repeat&#8221; background_blend__hover=&#8221;normal&#8221; background_blend__hover_enabled=&#8221;normal&#8221; allow_player_pause__hover=&#8221;off&#8221; allow_player_pause__hover_enabled=&#8221;off&#8221; background_video_pause_outside_viewport__hover=&#8221;on&#8221; background_video_pause_outside_viewport__hover_enabled=&#8221;on&#8221; body_letter_spacing__hover=&#8221;0px&#8221; body_letter_spacing__hover_enabled=&#8221;0px&#8221; body_text_shadow_style__hover=&#8221;none&#8221; body_text_shadow_style__hover_enabled=&#8221;none&#8221; body_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; body_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;][\/et_pb_blurb][\/et_pb_column][\/et_pb_row][\/et_pb_section][et_pb_section fb_built=&#8221;1&#8243; admin_label=&#8221;Services&#8221; _builder_version=&#8221;4.0.6&#8243; max_width=&#8221;90%&#8221; custom_padding=&#8221;0px||0px|||&#8221; locked=&#8221;off&#8221; da_is_popup=&#8221;off&#8221; da_exit_intent=&#8221;off&#8221; da_has_close=&#8221;on&#8221; da_alt_close=&#8221;off&#8221; da_dark_close=&#8221;off&#8221; da_not_modal=&#8221;on&#8221; da_is_singular=&#8221;off&#8221; da_with_loader=&#8221;off&#8221; da_has_shadow=&#8221;on&#8221; da_disable_devices=&#8221;off|off|off&#8221;][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;3px|||||&#8221;]<\/p>\n<h1 name=\"4dda\" class=\"graf graf--h3\">Designing a successful data analytics platform for a rapidly changing world<\/h1>\n<h4 name=\"5fe8\" class=\"graf graf--h4\">Key design principles we learned while building Streamhub\u2019s video data analytics platform<\/h4>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row 
_builder_version=&#8221;4.0.6&#8243; min_height=&#8221;58px&#8221; custom_padding=&#8221;4px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_divider color=&#8221;#003387&#8243; divider_weight=&#8221;5px&#8221; _builder_version=&#8221;4.0.6&#8243;][\/et_pb_divider][\/et_pb_column][\/et_pb_row][et_pb_row column_structure=&#8221;1_2,1_2&#8243; _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;-28px|auto||auto||&#8221;][et_pb_column type=&#8221;1_2&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;|-24px||||&#8221; custom_padding=&#8221;||11px|||&#8221;]<\/p>\n<p><span style=\"font-weight: 400;\">We are in the age of data where you can change the ending of a film if it doesn\u2019t match your taste! <\/span><a href=\"http:\/\/www.digitalspy.com\/black-mirror\/\"><span style=\"font-weight: 400;\">Black Mirror<\/span><\/a><span style=\"font-weight: 400;\">&#8216;s &#8216;Bandersnatch&#8217; is Netflix&#8217;s first major interactive film where its script has millions of permutations. 
<\/span><\/p>\n<p><span style=\"font-weight: 400;\">\u201c<\/span><i><span style=\"font-weight: 400;\">It\u2019s an ambitious film that saw Netflix making several innovations in its platform, such as reworking its cache memory to store multiple scenes or creating the \u2018Branch Manager\u2019 to help Charlie Brooker write the script<\/span><\/i><span style=\"font-weight: 400;\">.\u201d <\/span><\/p>\n<p><span style=\"font-weight: 400;\">Another significant milestone for data-driven content creation, following on from the story behind the blockbuster<\/span><a href=\"https:\/\/www.theguardian.com\/media-network\/media-network-blog\/2015\/feb\/27\/house-cards-netflix-internet-video-kings\"><span style=\"font-weight: 400;\"> Netflix remake of \u2018House of Cards\u2019.\u00a0<\/span><\/a><\/p>\n<p><span style=\"font-weight: 400;\">From \u2018House of Cards\u2019 to \u2018Bandersnatch\u2019, what is that key ingredient of Netflix\u2019s success formula?<\/span><b> Innovation! <\/b><\/p>\n<p><span style=\"font-weight: 400;\">Netflix has always pushed the boundaries of disruptive innovation, revolutionising how we consume content everyday!<\/span><\/p>\n<p><span style=\"font-weight: 400;\">Simply put, innovation is the key to be ahead of the game for any company relying on data &#8211; innovating before your competitors, answering complex questions with confidence and rapidly adopting the latest powerful and sophisticated analytics tools. 
<\/span><\/p>\n<p><span style=\"font-weight: 400;\">If you happen to be a start-up, cost and resource limitations make it that more challenging whilst having the upside of flexibility, rapid decision-making and higher risk-taking<\/span><span style=\"font-weight: 400;\">.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][et_pb_column type=&#8221;1_2&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Black-Mirror-Bandersnatch-2018-movie-poster.jpg&#8221; _builder_version=&#8221;4.0.6&#8243; min_height=&#8221;643px&#8221; custom_margin=&#8221;||-17px|||&#8221; custom_padding=&#8221;||0px|11px||&#8221;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;3px|||||&#8221;]<\/p>\n<h3><strong>What we have learned<\/strong><\/h3>\n<p><span style=\"font-weight: 400;\">At Streamhub, we have always been good at adopting the latest technologies. From RDBMS to HBase, Cassandra to Elasticsearch to Apache Spark to Redshift to Clickhouse to Apache Druid to Snowflake, we have seen it all! We have been spending a lot of time in switching between technologies to respond to changing business demands, adapting the platform for different customers, heavy integrations and so on. And as we all know, change is part of the game. For a long time however, we had little time to sink our thoughts into fundamental innovation to consider <\/span><b>how to build the foundation of our technology to embrace change more easily<\/b><span style=\"font-weight: 400;\">.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">There is also another side to it. 
The \u2018modern\u2019 wave of data analytics based on\u00a0 \u2018distributed computing\u2019 has rapidly evolved from Hadoop to Google\u2019s MapReduce followed by Apache Spark and \u2018analytics in the cloud\u2019 making it possible to analyse a large set of data in almost a couple of clicks without leaving the desk! This has made the space even more competitive and now more than ever it has become <\/span><b>vital for data businesses to focus on rapid development and innovation<\/b><span style=\"font-weight: 400;\">.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">This article talks about how you can design your big data system iteratively so you can continually respond to opportunities without putting too much cost from the get go and still make it flexible to adopt future business needs.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">We will take you through five principles which are intended to introduce modularisation in your pipeline through tiering, the range of technologies that you can exploit, dimensions based on which we can separate these technologies, yet keep the core analytics narrow and defined, and keep yourself fast paced while keeping the costs in check. <\/span><\/p>\n<p><span style=\"font-weight: 400;\">We highly recommend giving this a thought, no matter which stage your company is in. This has helped us not only set futuristic direction for the technology but also helped the product in understanding the reach of data, identifying low hanging features and getting a hint of time-consuming future features so we could plan; keeping development, product and sales strategy in sync. 
This article is meant for data technologists, architects and early data technology adopters or data-related business people.<\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;-15px|auto||auto||&#8221; custom_padding=&#8221;0px|||||&#8221; min_height=&#8221;181px&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; min_height=&#8221;126px&#8221; custom_margin=&#8221;30px|||||&#8221;]<\/p>\n<h3><span style=\"font-weight: 400;\"><\/span><\/h3>\n<h3><strong>1. As veterans of data say \u201cdesign in tiers\u201d, but also do think between the tiers.<\/strong><\/h3>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\">Data tiering is about moving data across a pool of storage to optimise cost and performance, which traditionally is between \u2018hot\u2019 or \u2018warm\u2019 servers or \u2018archived\u2019 storage. With more technologies involved in modern systems, this extends to cross technologies and cross cloud platforms. Data usually progresses through four broad tiers: In-transit, raw, cleansed &amp; optimized, specialized-persistence. These tiers might break up further based on your business case. 
For our reference, let\u2019s take this example:<\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;-15px|auto||auto||&#8221; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Image1.png&#8221; _builder_version=&#8221;4.0.6&#8243; width=&#8221;95%&#8221; custom_padding=&#8221;|220px||||&#8221;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;-17px|||||&#8221; custom_padding=&#8221;0px|||||&#8221;]<\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\">Tiering helps you make your analytics system modular, which provides flexibility so that your organization can meet its ever-changing business requirements. Tiers could be based on:<\/span><\/p>\n<ul>\n<li><b>Capacity\/Archive:<\/b><span style=\"font-weight: 400;\"> To ensure that you are using the storage capacity optimally, you need to design how your data moves, based on time or access pattern, such that your storage best fits the performance requirements while keeping the cost in check. For example, move the data to AWS Glacier when it is older than 6 months or use S3 intelligent tiering to move infrequently accessed data to Glacier. <\/span><\/li>\n<li><b style=\"font-size: 16px;\">Standardization<\/b><span style=\"font-size: 16px;\">: If you are collecting data from several sources or for different customers, you may want to standardize them into a common format, so you don\u2019t have to deal with this in downstream layers. 
<\/span><\/li>\n<li><b style=\"font-size: 16px;\">Personally identifiable information (PII) segregation: <span style=\"font-weight: 400;\">You may want to remove personally identifiable information (PII) while data is in-transit, before it lands in your storage.<\/span><\/b><\/li>\n<li><b style=\"font-size: 16px;\">Storage tiering for performance within a datastore:<\/b><span style=\"font-size: 16px;\">\u00a0<\/span>Tier your storage as \u2018hot\u2019 and \u2018warm\u2019 zones which can be based on type of disks or instances. For example, with Elasticsearch, you can set an arbitrary attribute that can tag node server as hot or warm and set routing in a way that your recent data is routed to hot nodes while the rest is stored in warm nodes.<\/li>\n<li><b style=\"font-size: 16px;\">Persist to optimal datastore for specialised use-cases.<\/b><span style=\"font-size: 16px;\"> For example<\/span><b style=\"font-size: 16px;\">,<\/b><span style=\"font-size: 16px;\"> persist your data in a column-oriented database or an OLAP datastore for deep-dive analytics or pre-aggregate and scan for quick access or run through some ML model and store in some tree database for shortest path use cases. <\/span><\/li>\n<li><b style=\"font-size: 16px;\">Storage caching to optimise performance: <\/b><span style=\"font-size: 16px;\">Technologies like Alluxio or Apache Ignite provide distributed in-memory cache which can help you run your queries, let&#8217;s say Spark queries over Alluxio, blazing fast by sharing the data across jobs. <\/span><\/li>\n<li><b style=\"font-size: 16px;\">Optimal input format for analytics platform: <\/b><span style=\"font-size: 16px;\">For example<\/span><b style=\"font-size: 16px;\">,<\/b><span style=\"font-size: 16px;\"> you may want to store your data in columnar format like ORC + Zlib to optimise cost for your data lake solution, while in Databricks-Delta format for other ML use cases so you can exploit its time travel feature. 
<\/span><\/li>\n<li><b style=\"font-size: 16px;\">Across cloud platforms: <\/b><span style=\"font-size: 16px;\">You can also move your data across the cloud to use specific services of a cloud platform or to manage cost. However, it is always more complex and expensive to move data across cloud unless there is strong motivation to do so.\u00a0<\/span><\/li>\n<\/ul>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\">Since your data is moving between the tiers, you might need to access the data\u00a0 for <strong>validation or exploration<\/strong> for your business intelligence use cases. So you would need some analytic tool over these tiers, if possible on-demand, serverless and SQL-like so it is cost effective and easy to use. If you are on the cloud this tool can be AWS Athena or Google Big query or Azure data lake. I will elaborate on this later in the article.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">So a simple pipeline may look something like this:<\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Image2.jpg&#8221; _builder_version=&#8221;4.0.6&#8243;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;-23px|||||&#8221;]<\/p>\n<p><b><\/b><\/p>\n<p><b>Automated workflows and processes are necessary in order to reap the benefits of tiering. 
<\/b><span style=\"font-weight: 400;\">You would need to command some orchestration tools which can be a combination of services provided by your cloud platform, like AWS step functions,\u00a0 CloudWatch\/SNS; and\/or open-source options like Airflow, Luigi, Piper, etc. Automating your workflow definitely makes your life easier, at the same time, parts which are difficult to automate you can always start running manually first, optimize them and then consider automating. During the initial phase, your workflow might change many times.\u00a0<\/span><\/p>\n<p name=\"5fb3\" class=\"graf graf--p\">Also, think about the <strong class=\"markup--strong markup--p-strong\">data archiving, purging and data recovery<\/strong> process. If your pipeline breaks at any time, how would your support team be informed? How would you recover the data? Would it be manual or automatically triggered?<\/p>\n<p>[\/et_pb_text][et_pb_image _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;-19px|auto||auto||&#8221; custom_padding=&#8221;0px|||||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243;]<\/p>\n<h3><strong><\/strong><\/h3>\n<h3><strong>2. \u201cNo one datastore would fit for all your use-cases\u201d &#8211; get away from that monolithic thinking!<\/strong><\/h3>\n<p>&nbsp;<\/p>\n<p><span style=\"font-weight: 400;\">If you have been in an analytical business for a while, you would know that the scope, scale and depth of data requirements change very quickly. Teams may decide to move from one technology to another to cover all, and you will spend most of your time in \u2018migrations\u2019. This could be a sign of monolithic design.\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">Let\u2019s try a dumb but simple pictorial demonstration. 
As you go from left to right through these tiers (let&#8217;s take the example of the pipeline above), the analytical properties each tier offers are very different. It\u2019s obvious, you would like your ideal solution to have the following properties: cost-efficient at storage \/ compute \/ operations, data queryable as soon as it lands, highly concurrent and performant even with inconsistent traffic, elastic volume &#8211; being able to query the good extent of data from the latest to archived data and handle all your specialised use-cases. <\/span><\/p>\n<p><span style=\"font-weight: 400;\">In the picture below, the intensity of properties is increasing as you go outwards, so the \u2018dashed\u2019 one is our ideal solution. The same properties for analytic solution over T0 (green), T1-2 (blue) and T3 (crimson) are marked relative to each other. You may have many \u2018specialised\u2019 use-cases &#8211; I am skipping the further break down of T3 but if you do, it can add more properties to your ideal analytical system.\u00a0<\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Image3.jpg&#8221; _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;15px|||||&#8221; custom_padding=&#8221;|3px|0px|||&#8221;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;|auto|-37px|auto||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.27.4&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;20px|||||&#8221;]<\/p>\n<p><span style=\"font-weight: 400;\">As you must have noticed already, our ideal solution is a combination of all these different solutions! 
Instead of pushing all that pressure to one database and choking it,<\/span> <b>begin by designing your <\/b><b><i>\u201cideal\u201d<\/i><\/b><b> solution as a combination of different analytical services and datastores.\u00a0<\/b><\/p>\n<p><b><\/b><\/p>\n<ol>\n<li><span style=\"font-weight: 400;\">Identify the tiers you are exposing for analytics.<\/span><span style=\"font-weight: 400;\"><\/span><\/li>\n<li style=\"font-weight: 400;\"><span style=\"font-weight: 400;\">Identify the broad use-cases your exposed layers must cover.<\/span><\/li>\n<li style=\"font-weight: 400;\"><span style=\"font-weight: 400;\">Design for your specialized use-cases. Pick the best fit analytical platforms or datastores.\u00a0<\/span><\/li>\n<li style=\"font-weight: 400;\"><span style=\"font-weight: 400;\">Think hard about data refresh or update schedules, versioning and retention\/clean-up before you pick technology.<\/span><\/li>\n<li style=\"font-weight: 400;\"><span style=\"font-weight: 400;\">Once you have your best solution on paper, pick a subset that ticks your important features within your cost and resources your company can afford at the moment. As you grow, you might want to get closer to your ideal solution<\/span><span style=\"font-weight: 400;\">.<\/span><\/li>\n<\/ol>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/image5.png&#8221; _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;12px|||||&#8221;][\/et_pb_image][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243; min_height=&#8221;566px&#8221; custom_margin=&#8221;15px|auto||auto||&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;24px|||||&#8221;]<\/p>\n<h3><strong>3. 
Master a distributed processing framework for large scale data processing but do not underestimate the power of SQL-like serverless engines!<\/strong><\/h3>\n<p><strong><\/strong><\/p>\n<p><span style=\"font-weight: 400;\">Hadoop MapReduce emerged as the first and fastest way to extract business value from massively large datasets through distributed processing over commodity hardware, followed by tools such as Hive, HBase, Pig, etc. Fast forward a couple of years, Apache Spark emerged claiming to be much faster than Hadoop MR, along with streaming and ML capabilities. An excerpt from Wikipedia which compares the two: \u201c<\/span><i><span style=\"font-weight: 400;\">MapReduce programs read input data from disk, <\/span><\/i><a href=\"https:\/\/en.wikipedia.org\/wiki\/Map_(parallel_pattern)\"><i><span style=\"font-weight: 400;\">map<\/span><\/i><\/a><i><span style=\"font-weight: 400;\"> a function across the data, <\/span><\/i><a href=\"https:\/\/en.wikipedia.org\/wiki\/Fold_(higher-order_function)\"><i><span style=\"font-weight: 400;\">reduce<\/span><\/i><\/a><i><span style=\"font-weight: 400;\"> the results of the map, and store reduction results on disk. Spark&#8217;s RDDs function as a <\/span><\/i><a href=\"https:\/\/en.wikipedia.org\/wiki\/Working_set\"><i><span style=\"font-weight: 400;\">working set<\/span><\/i><\/a><i><span style=\"font-weight: 400;\"> for distributed programs that offers a (deliberately) restricted form of distributed <\/span><\/i><a href=\"https:\/\/en.wikipedia.org\/wiki\/Shared_memory\"><i><span style=\"font-weight: 400;\">shared memory<\/span><\/i><\/a><i><span style=\"font-weight: 400;\">.<\/span><\/i><span style=\"font-weight: 400;\">\u201d\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">Since Spark can also use HDFS for data storage and YARN for cluster management, it can run on AWS\u2019s Elastic MapReduce clusters and GCP\u2019s Dataproc clusters. 
In the data analytics world, it almost became fashionable to use Apache Spark!\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\">No doubt Spark serves a broader range of analytic tools through Streaming, MLlib and GraphX tools. But if we are considering only simple batch use-cases querying on terabytes scale, the serverless SQL-like services provided by AWS and Google are pretty powerful and come handy!<\/span><\/p>\n<p><span style=\"font-weight: 400;\">Google describes Bigquery as cloud data warehouse that supports super-fast SQL queries using the processing power of Google\u2019s infrastructure. Bigquery is<\/span><a href=\"https:\/\/cloud.google.com\/blog\/products\/gcp\/bigquery-under-the-hood\"><span style=\"font-weight: 400;\"> powered by<\/span><\/a><span style=\"font-weight: 400;\"> other Google technologies like Dremel (execution engine), Borg (compute), Colossus (distributed storage) and Jupiter (network) packaged such that it is convenient for running ad hoc queries across very large databases. While <\/span><a href=\"https:\/\/aws.amazon.com\/athena\/\"><span style=\"font-weight: 400;\">AWS describes<\/span><\/a><span style=\"font-weight: 400;\"> Athena as an interactive query service for conveniently analyzing data in S3 at low cost and without needing to set up a complex infrastructure. It is a managed PrestoDB engine. You can also run Presto on EMR but you have to manage your cluster while Athena is fully managed. For both the technologies you pay for what you use. 
These are meant for only batch loads, they would not be the right choice for client facing applications directly due to limited concurrency which cannot be scaled after certain limit per account.\u00a0<\/span><\/p>\n<p name=\"1c5d\" class=\"graf graf--p\">Also, there are cloud-based Datawarehouse solutions like Snowflake, Redshift, recently emerging Firebolt, that are pretty easy to work with while it gives you the power of a full-fledged data warehouse.<\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;40px|||||&#8221; custom_padding=&#8221;0px|||||&#8221;]<\/p>\n<h3><strong>4. <\/strong><strong class=\"markup--strong markup--h4-strong\">Relish technologies that allow you to separate storage and compute layers.<\/strong><\/h3>\n<p name=\"c77d\" class=\"graf graf--p\">Group your use cases based on cost, performance, and scalability requirements. Separate your [high-performance + consistent-traffic] use-cases from [on-demand + inconsistent-traffic] use-cases. This gives you the flexibility to design the two solutions independently and keep your architecture simple and yet cost-effective and scalable. For [on-demand + low-traffic] use cases, you can easily exploit analytic platforms with the following features while for [high-performance + consistent-traffic] you can further optimize.<\/p>\n<p><b>Separate your [high-performance + consistent-traffic] use-cases from [on-demand + inconsistent-traffic] use-cases.<\/b><span style=\"font-weight: 400;\"> This gives you the flexibility to design the two solutions independently and keep your architecture simple and yet cost-effective and scalable. 
For on-demand + low-traffic use cases, you can easily exploit analytic platforms with the following features:<\/span><\/p>\n<p><strong>Separation of storage and computing<\/strong><\/p>\n<p><span style=\"font-weight: 400;\">Historically, databases were seen as tightly coupled compute and storage components. But in the last few years \u201cseparation of storage &amp; compute\u201d has gained momentum. The ability to decouple compute and storage brings increased scalability and availability with dramatically reduced costs. This is possible because with this architecture, unlike typical databases, nodes don\u2019t \u2018own\u2019 the data, so no need to rebalance the nodes rather all the nodes see the same data on the network and adding a node adds more computing power and much faster, since there is no movement of data. Which also means you can independently and elastically scale storage and compute. There are a couple of solutions that give you this, to name a few &#8211; Snowflake, Apache Drill, Athena, Big Query, Redshift Spectrum.\u00a0<\/span><\/p>\n<p><strong>Pay-as-you-use\u00a0<\/strong><\/p>\n<p><span style=\"font-weight: 400;\">With decoupled storage and compute, it is possible that you pay by query or by the amount of data scanned. Like, Big Query and Athena charge per TB scanned or Snowflake which charges by computing power usage per-second. If the traffic is inconsistent during the day, this can highly optimise your cost.<\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243; custom_margin=&#8221;40px||40px||true|&#8221;]<\/p>\n<h3><strong>5. Flip the 80\/20 rule. 
Keep landing storage simple, so integrations are faster and easier to scale.<\/strong><\/h3>\n<p name=\"0cb2\" class=\"graf graf--p\">Once you have a pipeline setup and functioning smoothly, major time of your team should be spent on driving deeper insights or data integrations. And this is exactly where the data analytics team should be spending 80% of their time because this is what is going to bring revenue to your business.<\/p>\n<h3><strong><\/strong>\u00a0\u00a0<\/h3>\n<p><span style=\"font-weight: 400;\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/rule-concept-pareto-principle-business-145556108.jpg\" style=\"display: block; margin-left: auto; margin-right: auto;\" width=\"466\" height=\"232\" \/><\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p name=\"9028\" class=\"graf graf--p\">If you are integrating several data sources, it is a good idea to keep your landing storage simple, fast to ramp-up for the integration team, and something that can easily integrate with other services. Let\u2019s say\u200a\u2014\u200aif you are in the AWS ecosystem and your team is already comfortable with S3, imagine using something like Cassandra as your landing storage. Though Cassandra is also now a managed service on AWS, the technology will still be a learning curve for the integration team. Also, S3 can much more easily integrate with your microservices\u200a\u2014\u200ayou can trigger Lambda functions when let\u2019s say S3 objects arrive or trigger a workflow. Choosing a simple landing store that is handy for your team, and a fully going microservice can massively speed up integrations.<\/p>\n<p name=\"2ea2\" class=\"graf graf--p\"><strong class=\"markup--strong markup--p-strong\">Go for serverless analytical platforms or hosted solutions as much as possible<\/strong>, especially if you are a start-up. 
It helps you lower the learning curve and reduce operational risk. Other benefits are faster time to market and faster scaling. Also, if your analytical system comprises many technologies, maintaining your own infrastructure for each can be resource-intensive for a startup.<\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<h3><strong>To Summarise<\/strong><\/h3>\n<p><span style=\"font-weight: 400;\">It is important to design your data system in a way that you can rapidly develop and accommodate future business growth without putting too much cost upfront &#8211; design in tiers, keep things modular and do not put all the pressure on a single technology. Consider access patterns, performance and business criticality to pick the right technology or to move data across storage tiers. Do orchestration and automation as you go. Do consider the expertise of the team and exploit the latest powerful analytic tools to reduce the prep time and risk to the business. Always have your best and optimal solution in mind and move towards it as you grow and gain more skills. Spend most of your time driving value out of your data and on rapid integrations, which will directly impact the revenue of your business. 
Secure resources and time to innovate and <\/span><span style=\"font-weight: 400;\">innovate-on-time to stay ahead in the competition in today&#8217;s data driven world.\u00a0<\/span><span style=\"font-size: 16px;\">\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p><span style=\"font-weight: 400;\"><\/span><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243; custom_padding=&#8221;9px|||||&#8221; global_module=&#8221;29718&#8243; saved_tabs=&#8221;all&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_blurb title=&#8221;Written By Sambodhi Khandelwal&#8221; image=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/04\/T02J6GGTV-U02JKUN3D-cf206e549b01-512.jpg&#8221; icon_placement=&#8221;left&#8221; image_max_width=&#8221;115%&#8221; content_max_width=&#8221;1100px&#8221; _builder_version=&#8221;4.0.6&#8243; border_radii_image=&#8221;off||||&#8221;]<\/p>\n<p><span>CTO &amp; Co-Founder at Streamhub | Technology &amp; Startups<\/span><\/p>\n<p><span><span style=\"font-weight: 400;\">Sambodhi is the driving force behind Streamhub\u2019s technical innovation. Passionate about startup culture as well as solving problems by leveraging modern technologies, she has worked with some of the biggest names in online media including News International, Yahoo! 
and Thomson Reuters.<\/span><\/span><\/p>\n<p>[\/et_pb_blurb][\/et_pb_column][\/et_pb_row][et_pb_row _builder_version=&#8221;4.0.6&#8243;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;4.0.6&#8243;][et_pb_text _builder_version=&#8221;4.0.6&#8243;]<\/p>\n<p><span style=\"font-size: 16px;\">\u00a0<\/span><\/p>\n<p><span style=\"color: #333333; font-size: 22px;\">References<\/span><span style=\"font-size: 16px;\">\u00a0<\/span><\/p>\n<p><span style=\"font-weight: 400;\"><a target=\"_blank\" class=\"c-link\" delay=\"150\" aria-describedby=\"slack-kit-tooltip\" href=\"https:\/\/dissectingfiction.com\/bandersnatch-choice-and-the-future-of-netflix\/\" rel=\"noopener noreferrer\">Bandersnatch, Choice and the Future of Netflix\u00a0<\/a><\/span><\/p>\n<p><span style=\"font-weight: 400;\"><a href=\"https:\/\/en.wikipedia.org\/wiki\/Black_Mirror:_Bandersnatch\">Black Mirror: Bandersnatch<\/a><span>\u00a0<\/span><\/span><\/p>\n<p><a href=\"https:\/\/www.theguardian.com\/media-network\/media-network-blog\/2015\/feb\/27\/house-cards-netflix-internet-video-kings\"><span style=\"font-weight: 400;\">House of Cards: how Netflix\u2019s $100m gamble made them internet video kings<\/span><\/a><\/p>\n<p><a href=\"https:\/\/aws.amazon.com\/blogs\/big-data\/orchestrating-an-etl-process-using-aws-step-functions-for-amazon-redshift\/\">Orchestrate an ETL process using AWS Step Functions for Amazon Redshift<\/a><\/p>\n<p><a href=\"https:\/\/cloud.google.com\/blog\/products\/gcp\/bigquery-under-the-hood\">Google BigQuery &#8211; Under the hood<\/a><\/p>\n<p><a href=\"https:\/\/aws.amazon.com\/athena\/\">Amazon Athena \u2014 Serverless Interactive Query Service &#8211; AWS<\/a><\/p>\n<p><a href=\"https:\/\/docs.snowflake.net\/manuals\/user-guide\/intro-supported-features.html\">Snowflake\u00a0<\/a><a href=\"https:\/\/docs.snowflake.net\/manuals\/user-guide\/intro-supported-features.html\">\u2014<\/a><a href=\"https:\/\/docs.snowflake.net\/manuals\/user-guide\/intro-supported-features.html\"> Cloud-based data-warehousing 
platform<\/a><\/p>\n<p>[\/et_pb_text][\/et_pb_column][\/et_pb_row][\/et_pb_section]<\/p>\n","protected":false},"excerpt":{"rendered":"<p><span class=\"span-reading-time rt-reading-time\" style=\"display: block;\"><span class=\"rt-label rt-prefix\">Reading Time: <\/span> <span class=\"rt-time\"> 11<\/span> <span class=\"rt-label rt-postfix\">minutes<\/span><\/span><\/p>\n<p>Designing a successful data analytics platform for a rapidly changing world Key design principles we learned while building Streamhub\u2019s video data analytics platformWe are in the age of data where you can change the ending of a film if it doesn\u2019t match your taste! Black Mirror&#8217;s &#8216;Bandersnatch&#8217; is Netflix&#8217;s first major interactive film where its [&hellip;]<\/p>\n","protected":false},"author":6,"featured_media":29466,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_et_pb_use_builder":"on","_et_pb_old_content":"","_et_gb_content_width":"","content-type":"","_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":false,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2}},"categories":[13],"tags":[81,125,94,130,118,23,127,134,112,121,117,119,120,122,114,129,133,128,132,50,123,80,115,131,113],"class_list":["post-29384","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-news","tag-analytics","tag-athena","tag-audience","tag-automated-workflows","tag-aws","tag-big-data","tag-bigquery","tag-cdp","tag-data","tag-data-architecture","tag-data-lake","tag-data-pipeline","tag-data-science","tag-data-tiering","tag-databases","tag-databricks","tag-dmp","tag-google","tag-lamda","tag-netflix","tag-olap","tag-ott","tag-serverless","tag-snowflake","tag-spark"],"yoast_head":"<!-- This 
site is optimized with the Yoast SEO plugin v27.6 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>How to design a modern data analytics platform in a rapidly changing world - Streamhub.co.uk<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"How to design a modern data analytics platform in a rapidly changing world - Streamhub.co.uk\" \/>\n<meta property=\"og:description\" content=\"Reading Time:  11 minutesDesigning a successful data analytics platform for a rapidly changing world Key design principles we learned while building Streamhub\u2019s video data analytics platformWe are in the age of data where you can change the ending of a film if it doesn\u2019t match your taste! 
Black Mirror&#039;s &#039;Bandersnatch&#039; is Netflix&#039;s first major interactive film where its [&hellip;]\" \/>\n<meta property=\"og:url\" content=\"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/\" \/>\n<meta property=\"og:site_name\" content=\"Streamhub.co.uk\" \/>\n<meta property=\"article:published_time\" content=\"2020-02-20T23:43:47+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2021-06-12T09:21:07+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3-1024x715.jpg\" \/>\n\t<meta property=\"og:image:width\" content=\"1024\" \/>\n\t<meta property=\"og:image:height\" content=\"715\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/jpeg\" \/>\n<meta name=\"author\" content=\"Sambodhi\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"Sambodhi\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. 
reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"20 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\\\/\\\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#article\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/\"},\"author\":{\"name\":\"Sambodhi\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/person\\\/71fc449cd1d4677990e1942bee805c41\"},\"headline\":\"How to design a modern data analytics platform in a rapidly changing world\",\"datePublished\":\"2020-02-20T23:43:47+00:00\",\"dateModified\":\"2021-06-12T09:21:07+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/\"},\"wordCount\":4091,\"publisher\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/02\\\/Cover-3.jpg\",\"keywords\":[\"analytics\",\"Athena\",\"audience\",\"Automated Workflows\",\"AWS\",\"Big Data\",\"BigQuery\",\"CDP\",\"Data\",\"Data Architecture\",\"Data lake\",\"data pipeline\",\"Data Science\",\"Data Tiering\",\"Databases\",\"DataBricks\",\"DMP\",\"Google\",\"LAMDA\",\"Netflix\",\"OLAP\",\"ott\",\"Serverless\",\"Snowflake\",\"Spark\"],\"articleSection\":[\"News\"],\"inLanguage\":\"en-US\"},{\"@type\":\"WebPage\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/\",\"name\":\"How to design a modern data analytics platform in a rapidly changing world - 
Streamhub.co.uk\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#primaryimage\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/02\\\/Cover-3.jpg\",\"datePublished\":\"2020-02-20T23:43:47+00:00\",\"dateModified\":\"2021-06-12T09:21:07+00:00\",\"breadcrumb\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#primaryimage\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/02\\\/Cover-3.jpg\",\"contentUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/02\\\/Cover-3.jpg\",\"width\":3598,\"height\":2512},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/how-to-design-a-modern-data-analytics-platform\\\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\\\/\\\/streamhub.co.uk\\\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"How to design a modern data analytics platform in a rapidly changing 
world\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#website\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/\",\"name\":\"Streamhub.co.uk\",\"description\":\"Streamhub.co.uk\",\"publisher\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\\\/\\\/streamhub.co.uk\\\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":\"Organization\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\",\"name\":\"Streamhub.co.uk\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/\",\"logo\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/logo\\\/image\\\/\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/05\\\/SH-Logo.png\",\"contentUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/05\\\/SH-Logo.png\",\"width\":1397,\"height\":361,\"caption\":\"Streamhub.co.uk\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/logo\\\/image\\\/\"},\"sameAs\":[\"https:\\\/\\\/www.linkedin.com\\\/company\\\/3006156\\\/admin\\\/feed\\\/posts\\\/\"]},{\"@type\":\"Person\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/person\\\/71fc449cd1d4677990e1942bee805c41\",\"name\":\"Sambodhi\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"url\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"contentUrl\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"caption\":\"Sambodhi\"},\"url
\":false}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"How to design a modern data analytics platform in a rapidly changing world - Streamhub.co.uk","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/","og_locale":"en_US","og_type":"article","og_title":"How to design a modern data analytics platform in a rapidly changing world - Streamhub.co.uk","og_description":"Reading Time:  11 minutesDesigning a successful data analytics platform for a rapidly changing world Key design principles we learned while building Streamhub\u2019s video data analytics platformWe are in the age of data where you can change the ending of a film if it doesn\u2019t match your taste! Black Mirror's 'Bandersnatch' is Netflix's first major interactive film where its [&hellip;]","og_url":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/","og_site_name":"Streamhub.co.uk","article_published_time":"2020-02-20T23:43:47+00:00","article_modified_time":"2021-06-12T09:21:07+00:00","og_image":[{"width":1024,"height":715,"url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3-1024x715.jpg","type":"image\/jpeg"}],"author":"Sambodhi","twitter_card":"summary_large_image","twitter_misc":{"Written by":"Sambodhi","Est. 
reading time":"20 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#article","isPartOf":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/"},"author":{"name":"Sambodhi","@id":"https:\/\/streamhub.co.uk\/#\/schema\/person\/71fc449cd1d4677990e1942bee805c41"},"headline":"How to design a modern data analytics platform in a rapidly changing world","datePublished":"2020-02-20T23:43:47+00:00","dateModified":"2021-06-12T09:21:07+00:00","mainEntityOfPage":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/"},"wordCount":4091,"publisher":{"@id":"https:\/\/streamhub.co.uk\/#organization"},"image":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#primaryimage"},"thumbnailUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3.jpg","keywords":["analytics","Athena","audience","Automated Workflows","AWS","Big Data","BigQuery","CDP","Data","Data Architecture","Data lake","data pipeline","Data Science","Data Tiering","Databases","DataBricks","DMP","Google","LAMDA","Netflix","OLAP","ott","Serverless","Snowflake","Spark"],"articleSection":["News"],"inLanguage":"en-US"},{"@type":"WebPage","@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/","url":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/","name":"How to design a modern data analytics platform in a rapidly changing world - 
Streamhub.co.uk","isPartOf":{"@id":"https:\/\/streamhub.co.uk\/#website"},"primaryImageOfPage":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#primaryimage"},"image":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#primaryimage"},"thumbnailUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3.jpg","datePublished":"2020-02-20T23:43:47+00:00","dateModified":"2021-06-12T09:21:07+00:00","breadcrumb":{"@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#primaryimage","url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3.jpg","contentUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3.jpg","width":3598,"height":2512},{"@type":"BreadcrumbList","@id":"https:\/\/streamhub.co.uk\/how-to-design-a-modern-data-analytics-platform\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/streamhub.co.uk\/"},{"@type":"ListItem","position":2,"name":"How to design a modern data analytics platform in a rapidly changing 
world"}]},{"@type":"WebSite","@id":"https:\/\/streamhub.co.uk\/#website","url":"https:\/\/streamhub.co.uk\/","name":"Streamhub.co.uk","description":"Streamhub.co.uk","publisher":{"@id":"https:\/\/streamhub.co.uk\/#organization"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/streamhub.co.uk\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":"Organization","@id":"https:\/\/streamhub.co.uk\/#organization","name":"Streamhub.co.uk","url":"https:\/\/streamhub.co.uk\/","logo":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/streamhub.co.uk\/#\/schema\/logo\/image\/","url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/05\/SH-Logo.png","contentUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/05\/SH-Logo.png","width":1397,"height":361,"caption":"Streamhub.co.uk"},"image":{"@id":"https:\/\/streamhub.co.uk\/#\/schema\/logo\/image\/"},"sameAs":["https:\/\/www.linkedin.com\/company\/3006156\/admin\/feed\/posts\/"]},{"@type":"Person","@id":"https:\/\/streamhub.co.uk\/#\/schema\/person\/71fc449cd1d4677990e1942bee805c41","name":"Sambodhi","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","url":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","caption":"Sambodhi"},"url":false}]}},"views":142,"jetpack_publicize_connections":[],"jetpack_featured_media_url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/02\/Cover-3.jpg","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/29384","targetHints":{"allow"
:["GET"]}}],"collection":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/users\/6"}],"replies":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/comments?post=29384"}],"version-history":[{"count":37,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/29384\/revisions"}],"predecessor-version":[{"id":32288,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/29384\/revisions\/32288"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/media\/29466"}],"wp:attachment":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/media?parent=29384"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/categories?post=29384"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/tags?post=29384"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}