{"id":28376,"date":"2019-11-01T12:54:22","date_gmt":"2019-11-01T12:54:22","guid":{"rendered":"http:\/\/streamhub.co.uk\/?p=28376"},"modified":"2020-06-30T05:09:26","modified_gmt":"2020-06-30T05:09:26","slug":"apache-spark-tuning-manual","status":"publish","type":"post","link":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/","title":{"rendered":"Apache Spark Tuning Manual"},"content":{"rendered":"<span class=\"span-reading-time rt-reading-time\" style=\"display: block;\"><span class=\"rt-label rt-prefix\">Reading Time: <\/span> <span class=\"rt-time\"> 20<\/span> <span class=\"rt-label rt-postfix\">minutes<\/span><\/span>\n[et_pb_section fb_built=&#8221;1&#8243; admin_label=&#8221;Header&#8221; _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|0px|0|0px|false|false&#8221; locked=&#8221;off&#8221;][et_pb_row use_custom_gutter=&#8221;on&#8221; gutter_width=&#8221;2&#8243; _builder_version=&#8221;3.27.4&#8243; max_width=&#8221;1280px&#8221; use_custom_width=&#8221;on&#8221; custom_width_px=&#8221;1280px&#8221;][et_pb_column type=&#8221;4_4&#8243; _builder_version=&#8221;3.25&#8243; custom_padding=&#8221;|||&#8221; custom_padding__hover=&#8221;|||&#8221;][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;Work Sans|700|||||||&#8221; text_text_color=&#8221;#000000&#8243; text_font_size=&#8221;55px&#8221; text_line_height=&#8221;1.3em&#8221; ul_font=&#8221;||||||||&#8221; ol_font=&#8221;||||||||&#8221; header_font=&#8221;Work Sans|700|||||||&#8221; header_font_size=&#8221;55px&#8221; header_line_height=&#8221;1.5em&#8221; header_3_font=&#8221;||||||||&#8221; header_4_font=&#8221;||||||||&#8221; header_5_font=&#8221;||||||||&#8221; header_6_font=&#8221;||||||||&#8221; custom_padding=&#8221;||10px||false|false&#8221; text_font_size_tablet=&#8221;40px&#8221; text_font_size_phone=&#8221;30px&#8221; text_font_size_last_edited=&#8221;on|desktop&#8221; header_font_size_tablet=&#8221;40px&#8221; header_font_size_phone=&#8221;30px&#8221; 
header_font_size_last_edited=&#8221;on|desktop&#8221; border_color_all=&#8221;#000000&#8243; border_width_bottom=&#8221;4px&#8221; locked=&#8221;off&#8221; inline_fonts=&#8221;Times New Roman&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p><span style=\"font-family: 'Times New Roman'; font-weight: normal;\">Master Spark fundamentals &amp; optimizations\u00a0<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;Work Sans|600|||||||&#8221; text_line_height=&#8221;1.4em&#8221; link_font=&#8221;||||||||&#8221; link_text_color=&#8221;#1a0a38&#8243; locked=&#8221;off&#8221; inline_fonts=&#8221;Times New Roman&#8221;]<h3><strong><span style=\"font-family: 'Times New Roman';\"><span style=\"color: #333333;\">Streamhub&#8217;s all-in-one notes on how to optimize &amp; scale Spark jobs for something more than a POC<\/span><\/span><\/strong><\/h3>[\/et_pb_text][\/et_pb_column][\/et_pb_row][\/et_pb_section][et_pb_section fb_built=&#8221;1&#8243; use_custom_gutter=&#8221;on&#8221; gutter_width=&#8221;2&#8243; specialty=&#8221;on&#8221; padding_1_tablet=&#8221;0px||&#8221; padding_1_last_edited=&#8221;off|desktop&#8221; admin_label=&#8221;Content&#8221; _builder_version=&#8221;3.22&#8243; background_size=&#8221;contain&#8221; background_position=&#8221;top_center&#8221; inner_max_width=&#8221;1280px&#8221; custom_margin=&#8221;||-74px|||&#8221; custom_padding=&#8221;0|0px|54px|0px|false|false&#8221; use_custom_width=&#8221;on&#8221; 
custom_width_px=&#8221;1280px&#8221; background_size__hover=&#8221;cover&#8221; background_size__hover_enabled=&#8221;cover&#8221; background_position__hover=&#8221;center&#8221; background_position__hover_enabled=&#8221;center&#8221; use_background_color_gradient__hover=&#8221;off&#8221; use_background_color_gradient__hover_enabled=&#8221;off&#8221; background_color_gradient_start__hover=&#8221;#2b87da&#8221; background_color_gradient_start__hover_enabled=&#8221;#2b87da&#8221; background_color_gradient_end__hover=&#8221;#29c4a9&#8243; background_color_gradient_end__hover_enabled=&#8221;#29c4a9&#8243; background_color_gradient_type__hover=&#8221;linear&#8221; background_color_gradient_type__hover_enabled=&#8221;linear&#8221; background_color_gradient_direction__hover=&#8221;180deg&#8221; background_color_gradient_direction__hover_enabled=&#8221;180deg&#8221; background_color_gradient_direction_radial__hover=&#8221;center&#8221; background_color_gradient_direction_radial__hover_enabled=&#8221;center&#8221; background_color_gradient_start_position__hover=&#8221;0%&#8221; background_color_gradient_start_position__hover_enabled=&#8221;0%&#8221; background_color_gradient_end_position__hover=&#8221;100%&#8221; background_color_gradient_end_position__hover_enabled=&#8221;100%&#8221; background_color_gradient_overlays_image__hover=&#8221;off&#8221; background_color_gradient_overlays_image__hover_enabled=&#8221;off&#8221; parallax__hover=&#8221;off&#8221; parallax__hover_enabled=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; parallax_method__hover_enabled=&#8221;on&#8221; background_repeat__hover=&#8221;no-repeat&#8221; background_repeat__hover_enabled=&#8221;no-repeat&#8221; background_blend__hover=&#8221;normal&#8221; background_blend__hover_enabled=&#8221;normal&#8221; allow_player_pause__hover=&#8221;off&#8221; allow_player_pause__hover_enabled=&#8221;off&#8221; background_video_pause_outside_viewport__hover=&#8221;on&#8221; 
background_video_pause_outside_viewport__hover_enabled=&#8221;on&#8221;][et_pb_column type=&#8221;3_4&#8243; specialty_columns=&#8221;3&#8243; _builder_version=&#8221;3.25&#8243; custom_padding=&#8221;|||&#8221; padding_tablet__hover=&#8221;0px||&#8221; padding_last_edited__hover=&#8221;off|desktop&#8221; parallax__hover=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; padding_tablet=&#8221;0px||&#8221; padding_last_edited=&#8221;off|desktop&#8221; custom_padding__hover=&#8221;|||&#8221;][et_pb_row_inner _builder_version=&#8221;3.27.4&#8243; min_height=&#8221;3546px&#8221; locked=&#8221;off&#8221; parallax_1__hover=&#8221;off&#8221; parallax_1__hover_enabled=&#8221;off&#8221; parallax_2__hover=&#8221;off&#8221; parallax_2__hover_enabled=&#8221;off&#8221; parallax_3__hover=&#8221;off&#8221; parallax_3__hover_enabled=&#8221;off&#8221; parallax_4__hover=&#8221;off&#8221; parallax_4__hover_enabled=&#8221;off&#8221; parallax_method_1__hover=&#8221;on&#8221; parallax_method_1__hover_enabled=&#8221;on&#8221; parallax_method_2__hover=&#8221;on&#8221; parallax_method_2__hover_enabled=&#8221;on&#8221; parallax_method_3__hover=&#8221;on&#8221; parallax_method_3__hover_enabled=&#8221;on&#8221; parallax_method_4__hover=&#8221;on&#8221; parallax_method_4__hover_enabled=&#8221;on&#8221; use_background_color_gradient__hover=&#8221;off&#8221; use_background_color_gradient__hover_enabled=&#8221;off&#8221; background_color_gradient_start__hover=&#8221;#2b87da&#8221; background_color_gradient_start__hover_enabled=&#8221;#2b87da&#8221; background_color_gradient_end__hover=&#8221;#29c4a9&#8243; background_color_gradient_end__hover_enabled=&#8221;#29c4a9&#8243; background_color_gradient_type__hover=&#8221;linear&#8221; background_color_gradient_type__hover_enabled=&#8221;linear&#8221; background_color_gradient_direction__hover=&#8221;180deg&#8221; background_color_gradient_direction__hover_enabled=&#8221;180deg&#8221; 
background_color_gradient_direction_radial__hover=&#8221;center&#8221; background_color_gradient_direction_radial__hover_enabled=&#8221;center&#8221; background_color_gradient_start_position__hover=&#8221;0%&#8221; background_color_gradient_start_position__hover_enabled=&#8221;0%&#8221; background_color_gradient_end_position__hover=&#8221;100%&#8221; background_color_gradient_end_position__hover_enabled=&#8221;100%&#8221; background_color_gradient_overlays_image__hover=&#8221;off&#8221; background_color_gradient_overlays_image__hover_enabled=&#8221;off&#8221; parallax__hover=&#8221;off&#8221; parallax__hover_enabled=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; parallax_method__hover_enabled=&#8221;on&#8221; background_size__hover=&#8221;cover&#8221; background_size__hover_enabled=&#8221;cover&#8221; background_position__hover=&#8221;center&#8221; background_position__hover_enabled=&#8221;center&#8221; background_repeat__hover=&#8221;no-repeat&#8221; background_repeat__hover_enabled=&#8221;no-repeat&#8221; background_blend__hover=&#8221;normal&#8221; background_blend__hover_enabled=&#8221;normal&#8221; allow_player_pause__hover=&#8221;off&#8221; allow_player_pause__hover_enabled=&#8221;off&#8221; background_video_pause_outside_viewport__hover=&#8221;on&#8221; background_video_pause_outside_viewport__hover_enabled=&#8221;on&#8221; use_custom_gutter__hover=&#8221;off&#8221; use_custom_gutter__hover_enabled=&#8221;off&#8221; gutter_width__hover=&#8221;3&#8243; gutter_width__hover_enabled=&#8221;3&#8243; make_equal__hover=&#8221;off&#8221; make_equal__hover_enabled=&#8221;off&#8221; border_radii__hover=&#8221;on||||&#8221; border_radii__hover_enabled=&#8221;on||||&#8221; box_shadow_style__hover=&#8221;none&#8221; box_shadow_style__hover_enabled=&#8221;none&#8221; box_shadow_color__hover=&#8221;rgba(0,0,0,0.3)&#8221; box_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.3)&#8221; max_width__hover=&#8221;100%&#8221; max_width__hover_enabled=&#8221;100%&#8221; 
filter_hue_rotate__hover=&#8221;0deg&#8221; filter_hue_rotate__hover_enabled=&#8221;0deg&#8221; filter_saturate__hover=&#8221;100%&#8221; filter_saturate__hover_enabled=&#8221;100%&#8221; filter_brightness__hover=&#8221;100%&#8221; filter_brightness__hover_enabled=&#8221;100%&#8221; filter_contrast__hover=&#8221;100%&#8221; filter_contrast__hover_enabled=&#8221;100%&#8221; filter_invert__hover=&#8221;0%&#8221; filter_invert__hover_enabled=&#8221;0%&#8221; filter_sepia__hover=&#8221;0%&#8221; filter_sepia__hover_enabled=&#8221;0%&#8221; filter_opacity__hover=&#8221;100%&#8221; filter_opacity__hover_enabled=&#8221;100%&#8221; filter_blur__hover=&#8221;0px&#8221; filter_blur__hover_enabled=&#8221;0px&#8221; mix_blend_mode__hover=&#8221;normal&#8221; mix_blend_mode__hover_enabled=&#8221;normal&#8221; animation_style__hover=&#8221;none&#8221; animation_style__hover_enabled=&#8221;none&#8221; animation_repeat__hover=&#8221;once&#8221; animation_repeat__hover_enabled=&#8221;once&#8221; animation_direction__hover=&#8221;center&#8221; animation_direction__hover_enabled=&#8221;center&#8221; animation_duration__hover=&#8221;1000ms&#8221; animation_duration__hover_enabled=&#8221;1000ms&#8221; animation_delay__hover=&#8221;0ms&#8221; animation_delay__hover_enabled=&#8221;0ms&#8221; animation_intensity_slide__hover=&#8221;50%&#8221; animation_intensity_slide__hover_enabled=&#8221;50%&#8221; animation_intensity_zoom__hover=&#8221;50%&#8221; animation_intensity_zoom__hover_enabled=&#8221;50%&#8221; animation_intensity_flip__hover=&#8221;50%&#8221; animation_intensity_flip__hover_enabled=&#8221;50%&#8221; animation_intensity_fold__hover=&#8221;50%&#8221; animation_intensity_fold__hover_enabled=&#8221;50%&#8221; animation_intensity_roll__hover=&#8221;50%&#8221; animation_intensity_roll__hover_enabled=&#8221;50%&#8221; animation_starting_opacity__hover=&#8221;0%&#8221; animation_starting_opacity__hover_enabled=&#8221;0%&#8221; animation_speed_curve__hover=&#8221;ease-in-out&#8221; 
animation_speed_curve__hover_enabled=&#8221;ease-in-out&#8221; hover_transition_duration__hover=&#8221;300ms&#8221; hover_transition_duration__hover_enabled=&#8221;300ms&#8221; hover_transition_delay__hover=&#8221;0ms&#8221; hover_transition_delay__hover_enabled=&#8221;0ms&#8221; hover_transition_speed_curve__hover=&#8221;ease&#8221; hover_transition_speed_curve__hover_enabled=&#8221;ease&#8221;][et_pb_column_inner saved_specialty_column_type=&#8221;3_4&#8243; _builder_version=&#8221;3.25&#8243; custom_padding=&#8221;|||&#8221; parallax__hover=&#8221;off&#8221; parallax__hover_enabled=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; parallax_method__hover_enabled=&#8221;on&#8221; custom_padding__hover=&#8221;|||&#8221;][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/high-volume-analytics-resize.jpg&#8221; align=&#8221;center&#8221; align_tablet=&#8221;center&#8221; align_phone=&#8221;&#8221; align_last_edited=&#8221;on|desktop&#8221; _builder_version=&#8221;3.27.4&#8243; width=&#8221;100%&#8221; max_width=&#8221;100%&#8221; min_height=&#8221;263px&#8221; height=&#8221;349px&#8221; max_height=&#8221;399px&#8221; custom_margin=&#8221;-29px||||false|false&#8221; custom_padding=&#8221;0px||||false|false&#8221; border_radii=&#8221;on|5px|5px|5px|5px&#8221; border_width_all=&#8221;5px&#8221; border_color_all=&#8221;#002f38&#8243; locked=&#8221;off&#8221;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; 
header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; 
header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><span style=\"font-size: 16px;\">At Streamhub, we deal with many forms of data like user\u2019s (every minute) player viewership data, user\u2019s commerce data, user\u2019s subscription information, metadata feeds, metadata front-loaded, user panels, and several other third-party datasets. 
It is easy to imagine the amount of work required to unify these various forms of data from various sources, ingest them into our ecosystem, and make them richer, standardized and useful to our clients. At some point, we were running hundreds of jobs daily, crunching terabytes of data to support our extensive use-cases. All this heavy lifting is done by Apache Spark running on AWS Elastic MapReduce (EMR), in Yarn mode.<\/span><\/p>\n<p id=\"6695\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">In the last two years, we have learned a great deal about how to optimize Spark jobs on EMR. Writing your first job in Spark might take a day or two, but learning to tune Spark takes months if not years. This article was initially created as internal team notes, which I am now publishing in the hope that more people can benefit from them. It covers almost everything you should know to run your Spark jobs efficiently (on EMR\/Yarn, but it covers general aspects as well): performance optimizations, resource allocation, cost reduction, important concepts, common mistakes in coding\/configuring jobs, common exceptions, and some must-read\/must-watch resources about Spark we have found over time. 
Right, literally everything!<span style=\"font-size: 16px;\"><\/span><\/p>\n<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><a href=\"https:\/\/medium.com\/@sambodhi_72782\/spark-tuning-manual-47b98ccb2b2c\"><\/a><\/p>\n<p id=\"a400\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">This article will touch upon the following topics:<span style=\"font-size: 16px;\">\u00a0<\/span><\/p>[\/et_pb_text][et_pb_text quote_border_weight=&#8221;14px&#8221; quote_border_color=&#8221;#00ac69&#8243; _builder_version=&#8221;3.27.4&#8243; link_font=&#8221;||||||||&#8221; quote_font=&#8221;Work Sans|700|||||||&#8221; quote_text_color=&#8221;#000000&#8243; quote_font_size=&#8221;18px&#8221; quote_line_height=&#8221;1.8em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width=&#8221;700px&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; 
header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; 
header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote>\n<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><em>EMR Instance-types: Understand your workload and use specialized instances<\/em><br \/><em>EMR Purchasing options: choose the right option to optimize cost for your use-case: Spot or Spot Fleet or Spot blocking<\/em><br \/><em>Resource utilization: Configure your spark cluster to fully use your resources<\/em><br \/><em>\u2014 Static resource allocation<\/em><br \/><em>\u2014 \u2014 Understanding Spark\u2019s memory usage is important<\/em><br \/><em>\u2014 Dynamic resource allocation<\/em><br \/><em>GC Tuning<\/em><br \/><em>Bad code and related problems<\/em><br \/><em>\u2014 Avoid shuffle<\/em><br \/><em>\u2014 Some preferred methods: ReduceByKey over GroupByKey<\/em><br \/><em>\u2014 Avoid serialization of the whole object<\/em><br \/><em>\u2014 Operations reordering<\/em><br \/><em>\u2014 Improve Joins<\/em><br \/><em>\u2014 \u2014 Using Broadcast variable (Joining very large datasets with a relatively small dataset)<\/em><br \/><em>\u2014 \u2014 Using filters pre-join (Joining 
very large datasets with a mid-sized dataset)<\/em><br \/><em>Degree of parallelism<\/em><br \/><em>\u2014 Repartition<\/em><br \/><em>\u2014 Coalesce<\/em><br \/><em>More on partitioners<\/em><br \/><em>Choice of serializer<\/em><br \/><em>Dedicated Spark local directories<\/em><br \/><em>Writing to databases<\/em><br \/><em>Data storage formats: Parquet, ORC, Databricks Delta &amp; compression options<\/em><br \/><em>Caching intermediate datasets<\/em><br \/><em>Use higher-level APIs: Dataframes\/Datasets<\/em><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; custom_padding=&#8221;||20px|||&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; 
header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; 
header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">This article was originally published on Medium, in case you find that format easier to read:\u00a0\u00a0<a href=\"https:\/\/medium.com\/@sambodhi_72782\/spark-tuning-manual-47b98ccb2b2c\">https:\/\/medium.com\/@sambodhi_72782\/spark-tuning-manual-47b98ccb2b2c<\/a><\/p>\n<p id=\"e941\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Reducing your processing cost is important, not least because the lower the cost, the better the resources you can get for your spending and the less effort you have to put into improving performance.<\/p>\n<h3 class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><span style=\"font-size: x-large;\"><strong><\/strong><\/span><\/h3>\n<h3 class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><span style=\"font-size: x-large;\"><strong><\/strong><\/span><\/h3>\n<h3 id=\"bf9c\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><span style=\"font-size: x-large;\"><strong>I<\/strong><\/span>nstance-types: Understand your workload and use specialised instances<\/h3>\n<ul class=\"\">\n<li class=\"fk fl co au fm b fn fo fp fq fr 
fs ft fu fv fw fx hi hj hk\" data-selectable-paragraph=\"\">ML is mostly CPU intensive; ETL is mostly IO intensive.<\/li>\n<li id=\"742c\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Spark is incredibly memory intensive, so we use memory-optimized instance types such as the r4 or newer r5 family.<\/li>\n<\/ul>\n<h3 class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><strong class=\"fm fy\"><span style=\"font-size: x-large;\"><\/span><\/strong><\/h3>\n<h3 id=\"3128\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><strong class=\"fm fy\"><span style=\"font-size: x-large;\">P<\/span><\/strong>urchasing options: choose the right option to optimize cost for your<\/h3>\n<h3 class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\">use-case: Spot or Spot Fleet or Spot blocking<\/h3>\n<ul class=\"\">\n<li id=\"7a7b\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx hi hj hk\" data-selectable-paragraph=\"\">Spot: Spot instances can help reduce your EC2 costs by 40\u201380%.<\/li>\n<li id=\"4f1c\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Spot Fleet: The advantage of using a Spot Fleet is that instead of specifying the instance types you want, you can specify your computing and memory<span>\u00a0<\/span><em class=\"gx\">capacity requirements<\/em>. Also, you can specify<span>\u00a0<\/span><em class=\"gx\">multiple subnets<\/em>\/availability zones. AWS will provide the available instances that fulfill those requirements. This significantly increases your chance of getting Spot Instances.<\/li>\n<li id=\"87bb\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Spot Block: Spot Instances with a<span>\u00a0<\/span><em class=\"gx\">specified duration<\/em><span>\u00a0<\/span>are designed not to be interrupted and will run continuously for the duration you select. 
AWS charges more for blocking, but still much less than on-demand. You can also specify the duration for which it will continue to look for spot instances before switching to on-demand instances.<\/li>\n<\/ul>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/0_93vUPwwg0WnAsHZ4.png&#8221; _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#000000&#8243; width=&#8221;54%&#8221; module_alignment=&#8221;center&#8221; custom_margin=&#8221;9px||||false|false&#8221; custom_padding=&#8221;0px||||false|false&#8221;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; custom_padding=&#8221;6px|||||&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; 
header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; 
header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote>\n<p>Good reads:<\/p>\n<p><a href=\"https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/spot-requests.html#fixed-duration-spot-instances\" data-href=\"https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/spot-requests.html#fixed-duration-spot-instances\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/spot-requests.html#fixed-duration-spot-instances<\/a><\/p>\n<p><a href=\"https:\/\/aws.amazon.com\/blogs\/aws\/new-ec2-spot-blocks-for-defined-duration-workloads\/\" data-href=\"https:\/\/aws.amazon.com\/blogs\/aws\/new-ec2-spot-blocks-for-defined-duration-workloads\/\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/aws.amazon.com\/blogs\/aws\/new-ec2-spot-blocks-for-defined-duration-workloads<\/a><\/p>\n<\/blockquote>\n<p><span>A common problem we noticed related to this was instances being blacklisted from cluster or cluster resizing unexpectedly. 
It is more common to see this in long-running jobs, when it tries to reacquire the instances, causing the tasks\/jobs to fail, which after a couple of retries may lead to this. You can see the blacklisted instances in the Spark console, and you can check the EMR console to see if the problem was due to instances being unavailable:<\/span><\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/0_Ia95fwG1qbAQRocS.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; 
header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; 
header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gz ha hb hc hd he hf hg hh bo\"><\/span><\/strong><\/span><\/h3>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gz ha hb hc hd he hf hg hh bo\"><\/span><\/strong><\/span><\/h3>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gz ha hb hc hd he hf hg hh bo\">R<\/span><\/strong><\/span>esource utilisation: Configure your spark cluster to fully use your resources<\/h3>\n<h4><strong class=\"fm fy\" style=\"font-size: 16px;\"><\/strong><\/h4>\n<h4><strong class=\"fm fy\" style=\"font-size: 16px;\"><\/strong><\/h4>\n<h4><span style=\"font-size: large;\"><strong class=\"fm fy\">Static resource allocation<\/strong><\/span><\/h4>\n<p><strong class=\"fm fy\"><\/strong><\/p>\n<p id=\"6fbe\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">When you configure your cluster, to fully utilize your resources ensure the following equation holds true:<\/p>[\/et_pb_text][\/et_pb_column_inner][\/et_pb_row_inner][et_pb_row_inner _builder_version=&#8221;3.27.4&#8243; custom_margin=&#8221;-95px||50px||false|false&#8221; custom_padding=&#8221;0px||||false|false&#8221;][et_pb_column_inner saved_specialty_column_type=&#8221;3_4&#8243; _builder_version=&#8221;3.25&#8243; custom_padding=&#8221;|||&#8221; 
parallax__hover=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; custom_padding__hover=&#8221;|||&#8221;][et_pb_text quote_border_weight=&#8221;14px&#8221; quote_border_color=&#8221;#00ac69&#8243; _builder_version=&#8221;3.27.4&#8243; link_font=&#8221;||||||||&#8221; quote_font=&#8221;Work Sans|700|||||||&#8221; quote_text_color=&#8221;#000000&#8243; quote_font_size=&#8221;18px&#8221; quote_line_height=&#8221;1.8em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width=&#8221;700px&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; 
header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; 
header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote>\n<p id=\"c547\" class=\"fk fl co gx fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><span>M = spark.executor.memory + spark.yarn.executor.memoryOverhead (by default 0.1 of executor.memory) &lt; container-memory.<\/span><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; 
header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; 
header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p id=\"0b6d\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Where \u2018Container memory\u2019 is the amount of physical memory that can be allocated per container. 
According to<span>\u00a0<\/span><a href=\"https:\/\/www.cloudera.com\/documentation\/enterprise\/5-4-x\/topics\/cdh_ig_running_spark_on_yarn.html\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">Cloudera documentation<\/a>,<span>\u00a0<\/span><em class=\"gx\">when running Spark on YARN, each Spark executor runs as a YARN container.<span>\u00a0<\/span><\/em>Multiple executors (and therefore containers) can run in one instance, where M cannot be less than<span>\u00a0<\/span><em class=\"gx\">yarn.scheduler.minimum-allocation-mb<\/em><span>\u00a0<\/span>or more than<span>\u00a0<\/span><em class=\"gx\">yarn.scheduler.maximum-allocation-mb<\/em><span>\u00a0<\/span>and the sum of M for all executors\/containers on a single host cannot be more than yarn.nodemanager.resource.memory-mb<em class=\"gx\">.<\/em><\/p>\n<p id=\"54e3\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Exceptions related to this \u2014 INFO Client: <strong>Verifying our application has not requested more than the maximum memory capacity of the cluster\u00a0<em class=\"gx\">java.lang.IllegalArgumentException: Required executor memory (17408), overhead (1740 MB),\u00a0<\/em>is above the max threshold<\/strong><em class=\"gx\"><strong>\u00a0(12288 MB) of this cluster!<\/strong> Please check the values of \u2018yarn.scheduler.maximum-allocation-mb\u2019 and\/or \u2018yarn.nodemanager.resource.memory-mb\u2019<\/em>. If you are on an EMR version before 4.1.0, you are more likely to see this error. Why? Because before that release certain instance types had yarn.scheduler.maximum-allocation-mb set to a lower value than yarn.nodemanager.resource.memory-mb. So for certain configurations like<span>\u00a0<\/span><em class=\"gx\">maximizeResourceAllocation<\/em>, it might try to use all the available memory, in which case it would hit the max threshold. 
With release 4.1.0, AWS fixed this to make yarn.scheduler.maximum-allocation-mb equal to yarn.nodemanager.resource.memory-mb which fixes this problem. You can check the values of these configuration parameters for different instance types<span>\u00a0<\/span><a href=\"https:\/\/docs.aws.amazon.com\/emr\/latest\/ReleaseGuide\/emr-hadoop-task-config.html\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">here<\/a>.<\/p>\n<p id=\"4b84\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Easier to understand with an example &#8211;<\/p>\n<ul class=\"\">\n<li id=\"88cc\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx hi hj hk\" data-selectable-paragraph=\"\">Let\u2019s say we are using<span>\u00a0<\/span><em class=\"gx\">r4.4xlarge instances<\/em><span>\u00a0<\/span>with 16 vCPUs, 130 GB RAM but available memory for containers is 116736 MB.<\/li>\n<li id=\"6087\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Leave one core for the OS and let\u2019s say we set the number of<span>\u00a0<\/span><em class=\"gx\">executor-cores<\/em><span>\u00a0<\/span>as 3 (16\u20131 = 15).<\/li>\n<li id=\"f57d\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Calculate the number of executors: Number of executors per instance = (total number of virtual cores per instance \u2014 1) \/<span>\u00a0<\/span><em class=\"gx\">executor-cores<span>\u00a0<\/span><\/em>with 3 cores per executor, we can have 5 executors-per-instance (16\u20131) \/ 3 = 5. So if our cluster has 4 instances, num-executors (in cluster) = 5 * 4 = 20.<\/li>\n<li id=\"3a49\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx hi hj hk\" data-selectable-paragraph=\"\">Set<span>\u00a0<\/span><em class=\"gx\">spark.executor.memory<\/em><span>\u00a0<\/span>18 GB. 
Therefore, in our case: (18 + (0.1 * 18)) * 5 = 99GB &lt; 116736MB (container memory)<\/li>\n<\/ul>\n<p id=\"4ece\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">This is just an example of how you can configure your cluster to make maximum utilization of the resources, obviously, there are many different configurations possible. You need to experiment with them and see what works best for your application.<\/p>\n<p id=\"cfbd\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><span style=\"text-decoration: underline;\">Understanding Spark\u2019s memory usage is important<\/span><\/p>\n<p id=\"924c\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Memory usage largely falls into two categories: execution and storage. Few points to grasp to understand the memory model:<\/p>[\/et_pb_text][et_pb_text quote_border_weight=&#8221;14px&#8221; quote_border_color=&#8221;#00ac69&#8243; _builder_version=&#8221;3.27.4&#8243; link_font=&#8221;||||||||&#8221; quote_font=&#8221;Work Sans|700|||||||&#8221; quote_text_color=&#8221;#000000&#8243; quote_font_size=&#8221;18px&#8221; quote_line_height=&#8221;1.8em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width=&#8221;700px&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; 
header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; 
header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote>\n<p id=\"c547\" class=\"fk fl co gx fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><span>M = (spark.executor.memory - 300 MB) * spark.memory.fraction<\/span><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; 
header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; 
header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p id=\"4381\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Where M is the unified shared memory between execution and storage. 
The default value of spark.memory.fraction is 0.6, i.e. 60%.<\/p>\n<p id=\"7095\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">The remaining 40% of memory is used as \u2018user memory\u2019, reserved for user data structures, internal metadata, etc.<\/p>[\/et_pb_text][et_pb_text quote_border_weight=&#8221;14px&#8221; quote_border_color=&#8221;#00ac69&#8243; _builder_version=&#8221;3.27.4&#8243; link_font=&#8221;||||||||&#8221; quote_font=&#8221;Work Sans|700|||||||&#8221; quote_text_color=&#8221;#000000&#8243; quote_font_size=&#8221;18px&#8221; quote_line_height=&#8221;1.8em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width=&#8221;700px&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; 
header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; 
header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote>\n<p id=\"b6b0\" class=\"fk fl co gx fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><em class=\"au\">M = execution memory + storage memory<\/em><\/p>\n<p id=\"2a68\" class=\"fk fl co gx fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><em class=\"au\">Execution memory &lt;= M * (1 \u2014 spark.memory.storageFraction)<\/em><\/p>\n<p id=\"f304\" class=\"fk fl co gx fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><em class=\"au\">Storage memory &gt;= M * spark.memory.storageFraction<\/em><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p><span>Spark may evict the storage memory if required by Execution but only until total storage memory usage falls under this threshold defined by spark.memory.storageFraction. 
Default value of spark.memory.storageFraction is 0.5.<\/span><\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/1_sFs6oECXHz49FW6PnaDL5A.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; 
header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; 
header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p id=\"bf23\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">One of the common exceptions related to memory is &#8211; \u2018<strong><em class=\"gx\">ExecutorLostFailure Reason: Container killed by YARN for exceeding memory limits. 2.5 GB of 2.5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead<\/em>\u2019<\/strong><\/p>\n<p id=\"7abe\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Memory overhead is the amount of off-heap memory allocated to each executor. Memory overhead is used for buffers and thread stacks. By default, it is set to either 10% of executor memory or 384 MB, whichever is higher. You can try to increase this but it should respect the equation above i.e. 
(spark.executor.memory + spark.yarn.executor.memoryOverhead) * num-of-executors-per-instance &lt; container memory.<\/p>\n<p id=\"eb13\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Because this is about off-heap memory, there can be other (and better) ways to fix this like reducing executor cores (since this would reduce the max number of tasks which reduces the required memory) or increasing number of partitions (since there are more partitions, amount of memory required per partition is less) or increasing executor memory (since this value is a factor of executor memory, increasing executor memory would increase memory overhead).<\/p>\n<p id=\"3b1a\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Another very common memory-related exception is <strong><em>OutOfMemory<\/em><\/strong>, which can happen for various reasons like misconfiguring the clusters \u2014 over utilising the memory, objects persisted only in memory not spilling on disk, too much shuffling or insufficient user memory. 
For example, if you are doing heavy processing in your application, such as image processing, you might want to decrease spark.memory.fraction to give more to user memory.<\/p>\n<blockquote>\n<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><span style=\"font-size: 16px;\">Good reads<\/span><\/p>\n<p><a href=\"https:\/\/spark.apache.org\/docs\/latest\/tuning.html#memory-management-overview\" data-href=\"https:\/\/spark.apache.org\/docs\/latest\/tuning.html#memory-management-overview\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/spark.apache.org\/docs\/latest\/tuning.html#memory-management-overview<\/a><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; 
header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; 
header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<h4 id=\"af7f\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\"><strong class=\"fm fy\">Dynamic resource allocation<\/strong><\/h4>\n<p id=\"5cb5\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">As you can see, configuring your cluster properly requires a bit of maths, and if your workload is changing, something elastic would be better. Dynamic resource allocation simplifies this for you. It adapts resources used in processing according to the workload. 
You can control this through configuration parameter<span>\u00a0<\/span><em class=\"gx\">spark.dynamicAllocation.enabled<span>\u00a0<\/span><\/em>and other parameters that allow you to set the initial, minimum and maximum number of executors.<\/p>\n<blockquote name=\"38f9\" class=\"graf graf--blockquote\">\n<p>Good reads<\/p>\n<p><a href=\"https:\/\/spark.apache.org\/docs\/latest\/job-scheduling.html#dynamic-resource-allocation\" data-href=\"https:\/\/spark.apache.org\/docs\/latest\/job-scheduling.html#dynamic-resource-allocation\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/spark.apache.org\/docs\/latest\/job-scheduling.html#dynamic-resource-allocation<\/a><\/p>\n<\/blockquote>\n<p class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; 
header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; 
header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gz ha hb hc hd he hf hg hh bo\">G<\/span><\/strong>C<\/span> Tuning<\/h3>\n<p id=\"adac\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Spark heavily relies on Java\u2019s memory management and garbage collection since it can store large objects in memory.<\/p>\n<p id=\"735c\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">The first step in GC tuning is to collect statistics on the frequency and execution times of the GC. This can be done by passing<span>\u00a0<\/span><em class=\"gx\">-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps<\/em><span>\u00a0<\/span>as<span>\u00a0<\/span><em class=\"gx\">spark.executor.extraJavaOptions<span>\u00a0<\/span><\/em>and<span>\u00a0<\/span><em class=\"gx\">spark.driver.extraJavaOptions<\/em><span>\u00a0<\/span>in a job\u2019s configuration. 
Similarly, you can set the GC type: for example,<span>\u00a0<\/span><em class=\"gx\">-XX:+UseG1GC<\/em><span>\u00a0<\/span>specifies that the G1GC garbage collector should be used (default is -XX:+UseParallelGC).<\/p>\n<p id=\"fa6c\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Spark has seen evolution in GC from the traditional memory management (heap space is divided into Young-Eden\/Survivor and Old generations) \u2014 Concurrent Mark Sweep (CMS) GC and ParallelOld GC, to G1 GC (heap space divided into regions) which aims to achieve both high throughput and low pause, to project Tungsten available with higher-level APIs which aims to reduce GC by exploiting the schema knowledge of data to layout memory explicitly.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<blockquote>\n<p>Good reads<\/p>\n<p><a href=\"https:\/\/databricks.com\/blog\/2015\/05\/28\/tuning-java-garbage-collection-for-spark-applications.html\" data-href=\"https:\/\/databricks.com\/blog\/2015\/05\/28\/tuning-java-garbage-collection-for-spark-applications.html\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/databricks.com\/blog\/2015\/05\/28\/tuning-java-garbage-collection-for-spark-applications.html<\/a><\/p>\n<p><a href=\"https:\/\/aws.amazon.com\/blogs\/big-data\/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr\/\" data-href=\"https:\/\/aws.amazon.com\/blogs\/big-data\/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr\/\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/aws.amazon.com\/blogs\/big-data\/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr\/<\/a><\/p>\n<p><a href=\"http:\/\/saucam.github.io\/blog\/2015\/10\/14\/tuning-g1gc-spark\/\" 
data-href=\"http:\/\/saucam.github.io\/blog\/2015\/10\/14\/tuning-g1gc-spark\/\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">http:\/\/saucam.github.io\/blog\/2015\/10\/14\/tuning-g1gc-spark\/<\/a><\/p>\n<\/blockquote>\n<p><span>K. Shanmugam in his blog above compares memory usage with CMS GC and G1 GC through Ganglia as:<\/span><\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/1_1XmHLArk_jlppGiF0yXN0Q.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; 
header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; 
header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<p id=\"1af3\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">From<span>\u00a0<\/span><a href=\"https:\/\/www.oracle.com\/webfolder\/technetwork\/tutorials\/obe\/java\/G1GettingStarted\/index.html\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">oracle<\/a>:<span>\u00a0<\/span><em class=\"gx\">In the case of G1 GC, the heap is partitioned into a set of equal-sized heap regions, each a contiguous range of virtual memory. Certain region sets are assigned the same roles (Eden, Survivor, Old) as in the older collectors, but there is not a fixed size for them. This provides greater flexibility in memory usage.<span>\u00a0<\/span><\/em>G1 tracks the liveness of objects in the region (\u2018remembered set\u2019) and concentrates its collection and compaction activity on the areas of the heap that are likely to be full of reclaimable objects, that is, garbage. 
This makes pauses much more controlled.<\/p>\n<p id=\"ab30\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">A problem with G1 GC is \u2018humongous objects\u2019 \u2014 any object that is more than half a region size is considered a humongous object. 
If you see back-to-back concurrent cycles initiated due to Humongous allocations a probable fix is to increase<span>\u00a0<\/span><em class=\"gx\">-XX:G1HeapRegionSize<\/em><span>\u00a0<\/span>such that previous Humongous objects are no longer Humongous and will follow the regular allocation. My colleague, Yash Datta, talks more about fine-tuning with G1 GC<span>\u00a0<\/span><a href=\"http:\/\/saucam.github.io\/blog\/2015\/10\/14\/tuning-g1gc-spark\/\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">here<\/a><a href=\"http:\/\/saucam.github.io\/blog\/2015\/10\/14\/tuning-g1gc-spark\/\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">.<\/a><\/p>\n<p id=\"ad48\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Until Java 1.8u40, the reclamation of humongous regions was only done during full GC events. This was fixed or improved in later releases<span>\u00a0<\/span><a href=\"https:\/\/bugs.openjdk.java.net\/browse\/JDK-8027959\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\">https:\/\/bugs.openjdk.java.net\/browse\/JDK-8027959<\/a><span>\u00a0<\/span>so the impact of the issue has been reduced significantly for newer JVMs.<\/p>\n<h3 id=\"61bc\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx gy\"><span style=\"font-size: x-large;\"><strong><span class=\"r gz ha hb hc hd he hf hg hh bo\">B<\/span><\/strong><\/span>ad code and related problems<\/h3>\n<p id=\"cd92\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">There are these brilliant slides \u2018<a href=\"https:\/\/www.slideshare.net\/databricks\/strata-sj-everyday-im-shuffling-tips-for-writing-better-spark-programs\" class=\"br de hx hy hz ia\" target=\"_blank\" rel=\"noopener noreferrer\"><em class=\"gx\">Everyday I\u2019m Shuffling<\/em><\/a><em class=\"gx\">\u2019<\/em><span>\u00a0<\/span>by Holden Karau and Vida Ha from Databricks which I 
guess every spark engineer must-read for tips to write better Spark programs. I am listing here the very common problems we faced which may or may not be covered here, skipping the details if they are already covered.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; text_font=&#8221;||||||||&#8221; text_line_height=&#8221;2em&#8221; header_font=&#8221;||||||||&#8221; header_2_font=&#8221;Work Sans|700|||||||&#8221; header_2_font_size=&#8221;25px&#8221; header_2_line_height=&#8221;1.5em&#8221; max_width_tablet=&#8221;&#8221; max_width_phone=&#8221;&#8221; max_width_last_edited=&#8221;on|tablet&#8221; custom_padding=&#8221;||2px|||&#8221; header_2_font_size_tablet=&#8221;30px&#8221; header_2_font_size_phone=&#8221;15px&#8221; header_2_font_size_last_edited=&#8221;on|desktop&#8221; locked=&#8221;off&#8221; header_font_size__hover=&#8221;30px&#8221; header_font_size__hover_enabled=&#8221;30px&#8221; header_letter_spacing__hover=&#8221;0px&#8221; header_letter_spacing__hover_enabled=&#8221;0px&#8221; header_text_shadow_style__hover=&#8221;none&#8221; header_text_shadow_style__hover_enabled=&#8221;none&#8221; header_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_2_font_size__hover=&#8221;26px&#8221; header_2_font_size__hover_enabled=&#8221;26px&#8221; header_2_letter_spacing__hover=&#8221;0px&#8221; header_2_letter_spacing__hover_enabled=&#8221;0px&#8221; header_2_line_height__hover=&#8221;1em&#8221; header_2_line_height__hover_enabled=&#8221;1em&#8221; header_2_text_shadow_style__hover=&#8221;none&#8221; header_2_text_shadow_style__hover_enabled=&#8221;none&#8221; header_2_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_2_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_3_font_size__hover=&#8221;22px&#8221; header_3_font_size__hover_enabled=&#8221;22px&#8221; header_3_letter_spacing__hover=&#8221;0px&#8221; 
header_3_letter_spacing__hover_enabled=&#8221;0px&#8221; header_3_line_height__hover=&#8221;1em&#8221; header_3_line_height__hover_enabled=&#8221;1em&#8221; header_3_text_shadow_style__hover=&#8221;none&#8221; header_3_text_shadow_style__hover_enabled=&#8221;none&#8221; header_3_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_3_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_4_font_size__hover=&#8221;18px&#8221; header_4_font_size__hover_enabled=&#8221;18px&#8221; header_4_letter_spacing__hover=&#8221;0px&#8221; header_4_letter_spacing__hover_enabled=&#8221;0px&#8221; header_4_line_height__hover=&#8221;1em&#8221; header_4_line_height__hover_enabled=&#8221;1em&#8221; header_4_text_shadow_style__hover=&#8221;none&#8221; header_4_text_shadow_style__hover_enabled=&#8221;none&#8221; header_4_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_4_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_5_font_size__hover=&#8221;16px&#8221; header_5_font_size__hover_enabled=&#8221;16px&#8221; header_5_letter_spacing__hover=&#8221;0px&#8221; header_5_letter_spacing__hover_enabled=&#8221;0px&#8221; header_5_line_height__hover=&#8221;1em&#8221; header_5_line_height__hover_enabled=&#8221;1em&#8221; header_5_text_shadow_style__hover=&#8221;none&#8221; header_5_text_shadow_style__hover_enabled=&#8221;none&#8221; header_5_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; header_5_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; header_6_font_size__hover=&#8221;14px&#8221; header_6_font_size__hover_enabled=&#8221;14px&#8221; header_6_letter_spacing__hover=&#8221;0px&#8221; header_6_letter_spacing__hover_enabled=&#8221;0px&#8221; header_6_line_height__hover=&#8221;1em&#8221; header_6_line_height__hover_enabled=&#8221;1em&#8221; header_6_text_shadow_style__hover=&#8221;none&#8221; header_6_text_shadow_style__hover_enabled=&#8221;none&#8221; header_6_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; 
header_6_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221; text_letter_spacing__hover=&#8221;0px&#8221; text_letter_spacing__hover_enabled=&#8221;0px&#8221; text_text_shadow_style__hover=&#8221;none&#8221; text_text_shadow_style__hover_enabled=&#8221;none&#8221; text_text_shadow_color__hover=&#8221;rgba(0,0,0,0.4)&#8221; text_text_shadow_color__hover_enabled=&#8221;rgba(0,0,0,0.4)&#8221;]<blockquote name=\"0ac7\" class=\"graf graf--blockquote\">\n<p>Good read:\u00a0<a href=\"https:\/\/www.slideshare.net\/databricks\/strata-sj-everyday-im-shuffling-tips-for-writing-better-spark-programs\" data-href=\"https:\/\/www.slideshare.net\/databricks\/strata-sj-everyday-im-shuffling-tips-for-writing-better-spark-programs\" class=\"markup--anchor markup--blockquote-anchor\" rel=\"noopener noreferrer\" target=\"_blank\">https:\/\/www.slideshare.net\/databricks\/strata-sj-everyday-im-shuffling-tips-for-writing-better-spark-programs<\/a><\/p>\n<\/blockquote>\n<p id=\"ce19\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><strong class=\"fm fy\"><em class=\"gx\">Avoid shuffle (we know this but we still do!)<\/em><\/strong><\/p>\n<p id=\"2943\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Use built-in functions and avoid unnecessary collect. collect() sends all the partitions to the single driver which can cause OOM errors.<\/p>\n<p id=\"f151\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><strong class=\"fm fy\"><em class=\"gx\">Some preferred methods: ReduceByKey over GroupByKey<\/em><\/strong><\/p>\n<p id=\"caea\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">As ReduceByKey combines data before shuffling, therefore minimizing the amount of data transferred over the network compared to GroupByKey. 
GroupByKey often causes out-of-disk problems<em class=\"gx\">.<\/em><\/p>\n<p id=\"e9cc\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\"><strong class=\"fm fy\"><em class=\"gx\">Avoid serialization of the whole object<\/em><\/strong><\/p>\n<p id=\"4959\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">Avoid serialization of the whole object due to references in closures passed to transformation functions (map, flatMap, filter, etc) or while broadcasting a variable. When you apply such transformations, your transformation code or closure is:<\/p>\n<ol class=\"\">\n<li id=\"cce4\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx ih hj hk\" data-selectable-paragraph=\"\">Serialized on the driver node<\/li>\n<li id=\"4c2b\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx ih hj hk\" data-selectable-paragraph=\"\">Shipped over the network to the worker nodes<\/li>\n<li id=\"305e\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx ih hj hk\" data-selectable-paragraph=\"\">Deserialized<\/li>\n<li id=\"4a66\" class=\"fk fl co au fm b fn hl fp hm fr hn ft ho fv hp fx ih hj hk\" data-selectable-paragraph=\"\">Executed on each partition independently<\/li>\n<\/ol>\n<p id=\"45d3\" class=\"fk fl co au fm b fn fo fp fq fr fs ft fu fv fw fx\" data-selectable-paragraph=\"\">For example, the code below will fail because it tries to serialize Executor (which is not Serializable) and throws<span>\u00a0<\/span><em class=\"gx\">SparkException: <strong>Task not serializable Caused by<\/strong><\/em><strong>\u00a0<em class=\"gx\">java.io.NotSerializableException<\/em><\/strong>.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"d686\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">val <\/strong><em 
class=\"ke\">rdd<\/em>: RDD[String] = \u2026\n<\/span><span id=\"d840\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">object <\/strong>Executor {<\/span><span id=\"0805\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> def <\/strong>bc = sc.broadcast(map)<\/span><span id=\"1b61\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><em class=\"ke\">  <br \/> rdd<\/em>.map(bc.value.get)\n<\/span><span id=\"86b6\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">}<\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p>Another common example:\u00a0<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"45ed\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">object <\/strong>Executor {<\/span><span id=\"d3a4\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> def <\/strong>f (log: String): String = log.stripSuffix(<strong class=\"lr gs\">\u201c\u201d<\/strong>)<\/span><span id=\"f63b\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> val <\/strong><em class=\"ke\">sample <\/em>= rdd.map(<em class=\"ke\">f<\/em>(_))<br \/><\/span><span id=\"4a2e\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">}<\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p id=\"3a55\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">To fix this, you would tend to make your class Serializable. 
It would work but may not be desirable since you want to be serializing as little as possible.<strong class=\"gg gs\"><span>\u00a0<\/span><\/strong>A better solution could be:<\/p>\n<p class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\"><\/p>\n<ol class=\"\">\n<li id=\"d55c\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr ln kq kr\" data-selectable-paragraph=\"\">Use your function as a first-class citizen if it is possible.<\/li>\n<\/ol>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243; custom_padding=&#8221;0px|||||&#8221;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"3267\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">object <\/strong>Executor {<\/span><span id=\"0f4e\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><em class=\"ke\"> <br \/> rdd<\/em>.map(_.stripSuffix(<strong class=\"lr gs\">\u201c\u201d<\/strong>))<br \/><\/span><span id=\"c4c3\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">}<\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p><span>2. 
Instead of def, use a\u00a0<\/span><em class=\"ke\">val<\/em><span>\u00a0function and enclose it in a function or block, so that Spark doesn\u2019t need to serialize the whole object because the closure can access everything it requires within the scope of the block.<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"7c13\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">object <\/strong>Executor {<\/span><span id=\"c464\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> val <\/strong><em class=\"ke\">xyz <\/em>= \u2026<\/span><span id=\"28a6\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">  <br \/> val <\/strong><em class=\"ke\">block <\/em>= {<\/span><span id=\"9b4b\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/>   val <\/strong>f: String =&gt; String = (log: String) =&gt; log.stripSuffix(<strong class=\"lr gs\">\u201c\u201d<\/strong>)<\/span><span id=\"9a4d\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><em class=\"ke\"> <br \/>   rdd<\/em>.map(f(_))<\/span><span id=\"7c34\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">  }<br \/><\/span><span id=\"806e\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">}<\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p><span>3. 
Put it into a companion object which is Serializable.<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243; custom_padding=&#8221;||0px|||&#8221;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"d40e\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">class <\/strong>Executor {<\/span><span id=\"1d50\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> val <\/strong><em class=\"ke\">xyz <\/em>= \u2026<br \/><\/span><span id=\"53c0\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">.. }<br \/><\/span><span id=\"1657\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">object <\/strong>Executor <strong class=\"lr gs\">extends <\/strong>java.io.Serializable {<\/span><span id=\"4b0c\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\"> <br \/> def <\/strong>f (log: String): String = log.stripSuffix(<strong class=\"lr gs\">\u201c\u201d<\/strong>)<br \/><\/span><span id=\"c27d\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">}<br \/><\/span><span id=\"5b4b\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">val <\/strong><em class=\"ke\">block <\/em>= rawRDD.map(<em class=\"ke\">f<\/em>(_))<\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p id=\"db2c\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\"><strong class=\"gg gs\"><em class=\"ke\">Operations reordering<\/em><\/strong><\/p>\n<p id=\"6907\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">If you are using RDD, then the reordering of operations can improve speed. 
However, if you are using higher-level APIs, the catalyst query optimizer automatically optimizes.<\/p>\n<p id=\"9f38\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\"><strong class=\"gg gs\"><em class=\"ke\">Improve Joins<\/em><\/strong><\/p>\n<ul class=\"\">\n<li id=\"6db2\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kp kq kr\" data-selectable-paragraph=\"\"><em class=\"ke\">Using Broadcast variable (Joining very large datasets with a relatively small dataset)<\/em><\/li>\n<\/ul>\n<p id=\"1e69\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Taking an example from the slides above: Join the dataset \u2018people_in_the_us\u2019 (large RDD) with \u2018states\u2019 (small RDD). There are only 50 keys in \u2018state\u2019, all the people data is shuffled to only these 50 keys (first figure) leading to uneven sharding and limited parallelism. Even larger clusters won\u2019t solve this problem. 
Instead, if you broadcast the \u2018state\u2019 dataset to all the workers, you can avoid these problems.<\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/1_-pOTQhp9LbEsGfmJa5JDYA.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p><span>If you are using RDDs, you have to broadcast explicitly, for example:\u00a0<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"ec78\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">val <\/strong>bc = sc.broadcast(smallRdd.keyBy(_.id).collect.toMap)<br \/><\/span><span id=\"47cf\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">largeRdd.map(x =&gt; bc.value.get(x.id)) <br \/><\/span><\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p><span>If you are using DataFrames, broadcast joins are done automatically. 
The config parameter\u00a0<\/span><em class=\"ke\">spark.sql.autoBroadcastJoinThreshold<\/em><span>\u00a0and a broadcast hint are used to control broadcasting and therefore affect the performance of your job.<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<pre class=\"jt ju jv jw jx lo lp ds\"><span id=\"dd27\" class=\"lq ga db bo lr b fj ls lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">val <\/strong>df1 = spark.range(1000)<br \/><\/span><span id=\"5345\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><strong class=\"lr gs\">val <\/strong>df2 = spark.range(1000)<br \/><\/span><span id=\"8d82\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">df1.join(df2, Seq(<strong class=\"lr gs\">\u201cid\u201d<\/strong>)).explain<br \/><\/span><span id=\"ee13\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">== Physical Plan ==<br \/><\/span><span id=\"b2f2\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">*(2) Project [id#0L]<\/span><span id=\"3d12\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\"><br \/>+- *(2) BroadcastHashJoin [id#0L], [id#2L], Inner, BuildRight<br \/><\/span><span id=\"702a\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">:- *(2) Range (0, 1000, step=1, splits=8)<br \/><\/span><span id=\"69a0\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]))<br \/><\/span><span id=\"0551\" class=\"lq ga db bo lr b fj lv lw lx ly lz lt y lu\" data-selectable-paragraph=\"\">+- *(1) Range (0, 1000, step=1, splits=8)<\/span><\/pre>\n<p>&nbsp;<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p id=\"5464\" class=\"ge gf db bo gg b gh jk gj jl jm jn 
jo jp jq jr gr\" data-selectable-paragraph=\"\">A common exception related to broadcast joins is<span>\u00a0<\/span><strong><em class=\"ke\">java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]<\/em><\/strong><\/p>\n<p id=\"b775\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">This could be because of the large size of the dataset being broadcasted and taking too much time because of the size. You can try disabling broadcast by setting<span>\u00a0<\/span><em class=\"ke\">spark.sql.autoBroadcastJoinThreshold<\/em><span>\u00a0<\/span>to -1 or increasing the broadcast time through spark.sql.broadcastTimeout (defaults to 5 minutes).<\/p>\n<p id=\"e8dd\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Try it out yourself! Re-run the above example with:<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<p>\u00a0<span style=\"font-size: 16px; font-family: 'Courier New', monospace;\">spark.conf.set(\u201cspark.sql.autoBroadcastJoinThreshold\u201d, -1)<\/span><\/p>\n<p><span style=\"font-size: 16px; font-family: 'Courier New', monospace;\">df1.join(df2, Seq(<\/span><strong class=\"lr gs\" style=\"font-size: 16px; font-family: 'Courier New', monospace;\">\u201cid\u201d<\/strong><span style=\"font-size: 16px; font-family: 'Courier New', monospace;\">)).explain<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<ul class=\"\">\n<li id=\"c5dc\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kp kq kr\" data-selectable-paragraph=\"\"><em class=\"ke\">Using filters pre-join (Joining very large datasets with a mid-sized dataset)<\/em><\/li>\n<\/ul>\n<p id=\"ff4c\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Joining the dataset \u2018people_in_the_us\u2019 (large RDD) with \u2018people_in_california\u2019 (mid-sized RDD) will shuffle 
everything! This can be optimized by first filtering \u2018people_in_the_us\u2019 down to the rows from California before the join, which reduces the amount of data shuffled.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<h3 class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kf\"><span style=\"font-size: x-large;\"><strong><span class=\"y kg kh ki kj kk kl km kn ko hb\"><\/span><\/strong><\/span><\/h3>\n<h3 id=\"0583\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kf\"><span style=\"font-size: x-large;\"><strong><span class=\"y kg kh ki kj kk kl km kn ko hb\">D<\/span><\/strong><\/span>egree of parallelism<\/h3>\n<p id=\"9bc9\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">To tune your job to use the cluster fully, it is important to understand how the data gets distributed over the cluster and how many partitions an RDD has.<\/p>\n<p id=\"bc8a\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Spark manages data through partitions, which help parallelize distributed data processing with minimal network traffic for sending data between the executors.<\/p>\n<p id=\"064c\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\"><em class=\"ke\">Spark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase,<span>\u00a0<\/span><\/em><a href=\"http:\/\/wiki.apache.org\/hadoop\/AmazonS3\" class=\"ax cj ld le lf lg\" target=\"_blank\" rel=\"noopener nofollow noreferrer\"><em class=\"ke\">Amazon S3<\/em><\/a><em class=\"ke\">, etc. 
Spark supports text files,<span>\u00a0<\/span><\/em><a href=\"http:\/\/hadoop.apache.org\/common\/docs\/current\/api\/org\/apache\/hadoop\/mapred\/SequenceFileInputFormat.html\" class=\"ax cj ld le lf lg\" target=\"_blank\" rel=\"noopener nofollow noreferrer\"><em class=\"ke\">SequenceFiles<\/em><\/a><em class=\"ke\">, and any other Hadoop<span>\u00a0<\/span><\/em><a href=\"http:\/\/hadoop.apache.org\/docs\/stable\/api\/org\/apache\/hadoop\/mapred\/InputFormat.html\" class=\"ax cj ld le lf lg\" target=\"_blank\" rel=\"noopener nofollow noreferrer\"><em class=\"ke\">InputFormat<\/em><\/a><em class=\"ke\">.<\/em><span>\u00a0<\/span>How Spark would distribute data read from these storage depends on the partitioning scheme in these storage and locality.<\/p>\n<p id=\"b0df\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">For example, if you are reading data from HDFS, by default, a partition is created for each HDFS block, which (by default) is 64MB. While if you run Spark in local mode, by default the number of partitions is the number of available cores. Also, the spark connectors for these databases would take locality into account. For example, if Spark and Cassandra are on the same physical machine, the<span>\u00a0<\/span><a href=\"https:\/\/github.com\/datastax\/spark-cassandra-connector\" class=\"ax cj ld le lf lg\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">spark-cassandra-connector<\/a><span>\u00a0<\/span>will ensure data locality for both reads and writes. If you load a Cassandra table into an RDD, the connector will always try to do the operations on this RDD locally on each node and when you save the RDD into Cassandra, the connector will also try to save results locally.<\/p>\n<p id=\"1f9a\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">RDDs get partitioned automatically, without programmer\u2019s intervention. 
However, there are times when you would like to adjust the size and number of partitions or the partitioning scheme according to the needs of your application.<\/p>\n<ul class=\"\">\n<li id=\"8d92\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kp kq kr\" data-selectable-paragraph=\"\"><strong class=\"gg gs\">Repartition:<span>\u00a0<\/span><\/strong>is used when the default number of partitions is too low and you want to increase it, or when data is unevenly distributed. Repartitioning performs a full shuffle. You can pass the \u2018number of partitions\u2019 as a parameter while reading the data or use repartition(n). Factors to consider when estimating the ideal number of partitions:<\/li>\n<\/ul>\n<p id=\"247b\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Cores: A rough estimate for the number of partitions could be 2 x (number of executors x number of cores per executor). You can start from here and try finding the optimal number of partitions.<\/p>\n<p id=\"a5ce\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Task input size: Check the task input size; ideally it should be around 128 MB. If it is bigger, you might want to experiment by adding more partitions.<\/p>\n<ul class=\"\">\n<li id=\"6813\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kp kq kr\" data-selectable-paragraph=\"\"><strong class=\"gg gs\">Coalesce:<\/strong><span>\u00a0<\/span>is used when you want to decrease the number of partitions with minimal shuffle. It is often used when writing data to databases to reduce the parallelism. coalesce(n, flag) accepts the \u2018number of partitions\u2019 and a boolean \u2018shuffle\u2019 parameter. 
repartition(n) calls coalesce(n, flag=true).<\/li>\n<\/ul>\n<p id=\"4f16\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">One important point to take note of while coalescing is that it affects the upstream processing parallelism in case you are avoiding a shuffle by using coalesce. This means that<span>\u00a0<\/span><strong class=\"gg gs\">rdd.map(f).coalesce(5)<\/strong><span>\u00a0<\/span>will force the processing of map over f to just 5 tasks even if the RDD had 100 partitions before.<\/p>\n<p id=\"d7b2\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Tuning partitions is one of the most common methods of optimizing Spark jobs.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<h3 class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kf\"><span style=\"font-size: x-large;\"><strong><span class=\"y kg kh ki kj kk kl km kn ko hb\"><\/span><\/strong><\/span><\/h3>\n<h3 id=\"009b\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kf\"><span style=\"font-size: x-large;\"><strong><span class=\"y kg kh ki kj kk kl km kn ko hb\">M<\/span><\/strong><\/span>ore on partitioners<\/h3>\n<p id=\"36cd\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Spark provides two inbuilt partitioners:<\/p>\n<ul class=\"\">\n<li id=\"f9a5\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr kp kq kr\" data-selectable-paragraph=\"\">HashPartitioner: Object.hashCode method is used to determine the partition in Spark as<span>\u00a0<\/span><code class=\"hc mc md me lr b\">partition = key.hashCode % numPartitions<\/code>.<\/li>\n<li id=\"0541\" class=\"ge gf db bo gg b gh ks gj kt jm ku jo kv jq kw gr kp kq kr\" data-selectable-paragraph=\"\">RangePartitioner: If keys in your data follow a particular ordering, range partitioning is an efficient partitioning technique. 
In the range partitioning method, tuples having keys within the same range will appear on the same machine.<\/li>\n<\/ul>\n<p id=\"90fc\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">Spark also allows you to write your CustomPartitioner and pass it through<span>\u00a0<\/span><code class=\"hc mc md me lr b\">partitionBy(partitioner).persist()<\/code>(if you noticed persist here, it is because if you are partitioning your data, you should persist it to avoid accidental repartitioning of data). If you are doing so, one important thing to look for is that as much as possible you use the operations that propagate the partitioner, in other words, that cannot change the key.<\/p>\n<p id=\"4f6e\" class=\"ge gf db bo gg b gh jk gj jl jm jn jo jp jq jr gr\" data-selectable-paragraph=\"\">For example map() vs mapValues() -if you applied any custom partitioning to your RDD, using<span>\u00a0<\/span><code class=\"hc mc md me lr b\">map<\/code><span>\u00a0<\/span>could lose that partitioner (the result will revert to default partitioning) as the keys might have changed.<span>\u00a0<\/span><code class=\"hc mc md me lr b\">mapValues<\/code>, however, would preserve any partitioner set on the RDD. 
Operations on RDDs that propagates a partitioner are:<\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/1_xgcbRtKCKn6eG0QZogCDqg.png&#8221; align=&#8221;center&#8221; admin_label=&#8221;operations on RDDs that propagates a partitioner&#8221; _builder_version=&#8221;3.27.4&#8243; height=&#8221;162px&#8221; max_height=&#8221;570px&#8221;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<h6 style=\"text-align: center;\"><span>Operations on RDDs that propagates a partitioner<\/span><\/h6>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px|||||&#8221;]<blockquote class=\"gc gd ge\">\n<p id=\"2f6f\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Good watch:<\/p>\n<p class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><span><a href=\"https:\/\/www.coursera.org\/lecture\/scala-spark-big-data\/partitioning-Vkhm0?source=post_page-----47b98ccb2b2c----------------------\">excerpt on partitioning by Coursera<\/a><\/span>\u00a0<\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; custom_padding=&#8221;0px||0px|||&#8221;]<blockquote class=\"gc gd ge\">\n<p class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><\/p>\n<\/blockquote>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<h3 id=\"09fa\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">C<\/span><\/strong><\/span>hoice of Serializer<\/h3>\n<p id=\"d169\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Serialization is sometimes a bottleneck when shuffling and caching data.<\/p>\n<ul class=\"\">\n<li id=\"6acb\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gq gr gs\" data-selectable-paragraph=\"\"><em 
class=\"gf\">By default, Spark serializes objects using Java\u2019s ObjectOutputStream framework, and can work with any class you create that implements<span>\u00a0<\/span><\/em><a href=\"https:\/\/docs.oracle.com\/javase\/8\/docs\/api\/java\/io\/Serializable.html\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\"><em class=\"gf\">java.io.Serializable<\/em><\/a><em class=\"gf\">.<\/em><\/li>\n<li id=\"8220\" class=\"eq er dc bk es b et gt ev gu ex gv ez gw fb gx fd gq gr gs\" data-selectable-paragraph=\"\"><em class=\"gf\">Kryo is significantly faster and more compact than Java serialization (often as much as 10x)<\/em>. For best performance, Kryo requires you to register every class you are going to use in advance; this lets Kryo avoid writing the name of the class along with every object, which is one of the largest overheads in serialization.<\/li>\n<\/ul>\n<p id=\"6ad0\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">However, Datasets&#8217; encoders are considered to be much faster compared to serialization\/deserialization via Java\/Kryo.<\/p>\n<blockquote class=\"gc gd ge\">\n<p id=\"2f6f\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Good reads:<\/p>\n<p id=\"5518\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Apache Spark Datasets<span>\u00a0<\/span><a href=\"https:\/\/databricks.com\/blog\/2016\/01\/04\/introducing-apache-spark-datasets.html\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">https:\/\/databricks.com\/blog\/2016\/01\/04\/introducing-apache-spark-datasets.html<\/a><\/p>\n<p id=\"ceba\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Tuning Spark<span>\u00a0<\/span><a href=\"https:\/\/spark.apache.org\/docs\/latest\/tuning.html#data-serialization\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow 
noreferrer\">https:\/\/spark.apache.org\/docs\/latest\/tuning.html#data-serialization<\/a><\/p>\n<\/blockquote>\n<h3 class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\"><\/span><\/strong><\/span><\/h3>\n<h3 id=\"e52c\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">D<\/span><\/strong><\/span>edicated Spark local directories<\/h3>\n<p id=\"502b\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Spark writes intermediate output files from map tasks, shuffles and data that gets stored to disk to a temporary directory or \u2018scratch\u2019 space. It is set through spark.local.dir or yarn.nodemanager.local-dirs in yarn mode, which by default is \/tmp. \/tmp is used by the operating system so it might get filled up very quickly which can lead to exceptions like<span>\u00a0<\/span><em class=\"gf\"><strong>java.io.IOException: No space left on device<\/strong>.<\/em><\/p>\n<p id=\"e45c\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">In such cases, you may notice that the \u2018Shuffle write\u2019 is huge, meaning there is a lot of data movement and the \u2018scratch\u2019 space is being used heavily, which can trigger the above exception. Increasing the scratch space might fix your problem.<\/p>\n<p id=\"8d28\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Spark recommends this directory to be fast, local and if possible dedicated. You can also pass a list of directories to this parameter.<\/p>\n<p id=\"c1bc\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">EMR provides a simple interface to add volumes to nodes in your cluster. 
For example, here I have added two volumes to my core instance:<\/p>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/0_0szGtPfbawZrLm3v.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p>Mounted as:<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<p>\u00a0<span style=\"font-size: 16px; font-family: 'Courier New', monospace;\">Filesystem Size Used Avail Use% Mounted on<\/span><\/p>\n<pre class=\"fg fh fi fj fk hq hr hs\"><span id=\"7a55\" class=\"ht hu dc bk hv b du hz ia ib ic id hx r hy\" data-selectable-paragraph=\"\">\/dev\/xvdb2 95G 857M 95G 1% \/mnt\n<\/span><span id=\"572c\" class=\"ht hu dc bk hv b du hz ia ib ic id hx r hy\" data-selectable-paragraph=\"\">\/dev\/xvdc 90G 425M 90G 1% \/mnt1<\/span>\u00a0<\/pre>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p id=\"79eb\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">If you check yarn-site.xml, you will see \u2018yarn.nodemanager.local-dirs\u2019 is already set to \u2018\/mnt\/yarn,\/mnt1\/yarn\u2019.<\/p>\n<h3 class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\"><\/span><\/strong><\/span><\/h3>\n<h3 id=\"ecc5\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">W<\/span><\/strong><\/span>riting to databases<\/h3>\n<p id=\"e45d\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Normally you would initialize the Database Connection on the Worker rather than the Driver since network sockets are non-serializable. 
When persisting an RDD to a database, connections should be initiated on each partition (foreachPartition) rather than in the driver or for each entry (which would either fail due to too many connections or be extremely slow).<\/p>\n<p id=\"74b8\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Your connector should be writing in batches; you can tune the related parameters such as batch size, the number of parallel writes, etc.<\/p>\n<p id=\"d7e0\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Prefer connectors which are \u2018locality aware\u2019, so that they can reduce the data movement based on the placement of your Database and Spark instances.<\/p>\n<h3 class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\"><\/span><\/strong><\/span><\/h3>\n<h3 id=\"acfa\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gg\"><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">D<\/span><\/strong><\/span>ata storage formats &amp; compression options<\/h3>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong class=\"es fe\"><\/strong><\/p>\n<p id=\"a667\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong class=\"es fe\">Parquet<\/strong><\/p>\n<p id=\"75a9\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><a href=\"http:\/\/parquet.io\/\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">Parquet<\/a><span>\u00a0<\/span>is a columnar format that automatically preserves the schema of the original data. Also, this format is supported by many other data processing systems. 
It can save both time and space since it enables the jobs to read a smaller fraction of the data which is required for the calculation<\/p>\n<p id=\"44cc\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">due to the push-down filters and the default compression (snappy) it comes with. Apart from compression, Parquet uses many different encodings for different data types which you can read about<span>\u00a0<\/span><a href=\"https:\/\/github.com\/apache\/parquet-format\/blob\/master\/Encodings.md\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">here<\/a>.<\/p>\n<p id=\"8ea7\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Since Parquet files require a schema, if you are using RDDs, you would normally convert your RDD to a Dataframe. RDDs are not required to have a schema, but Dataframes must have one.<\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243; background_color=&#8221;#d6d6d6&#8243;]<p>\u00a0<strong class=\"hv fe\" style=\"font-size: 16px; font-family: 'Courier New', monospace;\">val<\/strong><span style=\"font-size: 16px; font-family: 'Courier New', monospace;\"> df = rdd.toDF() \/\/ convert RDD to Dataframe<\/span><\/p>\n<p><span style=\"font-size: 16px; font-family: 'Courier New', monospace;\">df.write.parquet(&lt;filepath&gt;) \/\/ save as parquet<\/span><span style=\"font-size: 16px;\">\u00a0<\/span><\/p>[\/et_pb_text][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong>ORC<\/strong><strong class=\"es fe\"><\/strong><\/p>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">ORC is also a columnar format like Parquet, but in some cases (including our own benchmarks with Presto) it can outperform Parquet. 
Here is a comparison by <a href=\"https:\/\/www.datanami.com\/2018\/05\/16\/big-data-file-formats-demystified\/\">Nexla<\/a> of common data formats.<\/p>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/Nexla-File-Format.png\" width=\"511\" height=\"433\" alt=\"\" class=\"wp-image-28821 alignnone size-full\" style=\"display: block; margin-left: auto; margin-right: auto;\" srcset=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/Nexla-File-Format.png 511w, https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/Nexla-File-Format-480x407.png 480w\" sizes=\"auto, (min-width: 0px) and (max-width: 480px) 480px, (min-width: 481px) 511px, 100vw\" \/><\/p>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong><\/strong><\/p>\n<p id=\"f743\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong class=\"es fe\">Databricks Delta<\/strong><\/p>\n<p id=\"cd08\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">We are still exploring this but<span>\u00a0<\/span><a href=\"https:\/\/docs.databricks.com\/delta\/delta-intro.html\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">Delta<\/a><span>\u00a0<\/span>looks very promising especially for ML workloads since over the usual Parquet format, it adds transactional properties to your data and makes versioning and rollbacks possible, this allows you to reproduce your ML experiments. 
It can be called as \u2018super parquet\u2019 format.<\/p>\n<p id=\"86d6\" class=\"fl fm bz at fn b fo fp fq fr fs ft fu fv fw fx fy\" data-selectable-paragraph=\"\"><strong class=\"fn fz\">Compression Options<\/strong><\/p>\n<p id=\"bf1b\" class=\"fl fm bz at fn b fo fp fq fr fs ft fu fv fw fx fy\" data-selectable-paragraph=\"\">This slide from Yahoo! nicely summarises the popular compressions.<\/p>\n<p class=\"fl fm bz at fn b fo fp fq fr fs ft fu fv fw fx fy\" data-selectable-paragraph=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/compression-options-in-hadoop-a-tale-of-tradeoffs-8-638.jpg\" width=\"555\" height=\"417\" alt=\"\" class=\"wp-image-28827 alignnone size-full\" style=\"display: block; margin-left: auto; margin-right: auto;\" srcset=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/compression-options-in-hadoop-a-tale-of-tradeoffs-8-638.jpg 555w, https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/compression-options-in-hadoop-a-tale-of-tradeoffs-8-638-480x360.jpg 480w\" sizes=\"auto, (min-width: 0px) and (max-width: 480px) 480px, (min-width: 481px) 555px, 100vw\" \/><\/p>\n<blockquote>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Good read<\/p>\n<p class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Compression options -Yahoo! 
<a href=\"https:\/\/www.slideshare.net\/Hadoop_Summit\/singh-kamat-june27425pmroom210c\">https:\/\/www.slideshare.net\/Hadoop_Summit\/singh-kamat-june27425pmroom210c<\/a><\/p>\n<\/blockquote>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\"><\/span><\/strong><\/span><\/h3>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">C<\/span><\/strong><\/span>aching intermediate datasets<\/h3>\n<p id=\"7543\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Available levels:<\/p>\n<ul class=\"\">\n<li id=\"94d8\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd gq gr gs\" data-selectable-paragraph=\"\">MEMORY_ONLY (default) \u2014 stores deserialized Java objects, fast to scan (at runtime) but worse for GC pressure because you have random objects sitting around inside the cache.<\/li>\n<li id=\"870b\" class=\"eq er dc bk es b et gt ev gu ex gv ez gw fb gx fd gq gr gs\" data-selectable-paragraph=\"\">MEMORY_ONLY_SER stores as serialized Java objects (one byte array per partition). This can help cut down on GC and is more space-efficient than deserialized objects, but more CPU-intensive to read.<\/li>\n<li id=\"2f96\" class=\"eq er dc bk es b et gt ev gu ex gv ez gw fb gx fd gq gr gs\" data-selectable-paragraph=\"\">MEMORY_AND_DISK avoids expensive recomputation. If the RDD does not fit in memory, at least it would be on disk and you won\u2019t have to repeat the expensive computation.<\/li>\n<\/ul>[\/et_pb_text][et_pb_image src=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/0_SQpkgRaS3oHKnEyt.png&#8221; _builder_version=&#8221;3.27.4&#8243;][\/et_pb_image][et_pb_text _builder_version=&#8221;3.27.4&#8243;]<p id=\"41d9\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">To my surprise, this was the simplest and most effective method for us to gain performance. 
Some of our jobs ran six times faster after changing the persistence from default to MEMORY_AND_DISK_SER! That is because, in the case of MEMORY_ONLY caching if allowed storage memory is full, blocks are evicted for the newer blocks to be cached. But with MEMORY_AND_DISK, cached blocks are simply spilled over to disk without evicting any cached blocks.<\/p>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\"><\/span><\/strong><\/span><\/h3>\n<h3><span style=\"font-size: x-large;\"><strong><span class=\"r gh gi gj gk gl gm gn go gp fo\">U<\/span><\/strong><\/span>se higher-level APIs<\/h3>\n<p id=\"b034\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">If your data is structured or semi-structured, it is better to use Dataframes\/Datasets which not only provides higher-level abstraction, richer semantics, type-safety at compile-time, columnar access etc but also better space and speed efficiency which is mainly due to following optimizations:<\/p>\n<p id=\"8c65\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong class=\"es fe\">Tungsten<\/strong>: which substantially improves the memory and CPU efficiency. It uses the knowledge of data schema to directly lay out the memory explicitly. 
As a result, Tungsten Encoders can efficiently serialize\/deserialize JVM objects as well as generate compact bytecode that can execute at superior speeds.<\/p>\n<p id=\"174b\" class=\"eq er dc bk es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><strong class=\"es fe\">Catalyst query optimizer<\/strong>: which can do optimizations like reordering of operations or reducing the amount of data that must be read for a calculation.<\/p>\n<blockquote class=\"gc gd ge\">\n<p id=\"39f0\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Good reads:<\/p>\n<p id=\"1655\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">A Tale of Three Apache Spark APIs<span>\u00a0<\/span><a href=\"https:\/\/databricks.com\/blog\/2016\/07\/14\/a-tale-of-three-apache-spark-apis-rdds-dataframes-and-datasets.html\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">https:\/\/databricks.com\/blog\/2016\/07\/14\/a-tale-of-three-apache-spark-apis-rdds-dataframes-and-datasets.html<\/a><\/p>\n<p id=\"ec1d\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Power of Datasets<span>\u00a0<\/span><a href=\"https:\/\/databricks.com\/blog\/2016\/01\/04\/introducing-apache-spark-datasets.html\" class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener nofollow noreferrer\">https:\/\/databricks.com\/blog\/2016\/01\/04\/introducing-apache-spark-datasets.html<\/a><\/p>\n<p id=\"6cf9\" class=\"eq er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\">Project Tungsten and Catalyst SQL optimizer<span>\u00a0<\/span><a class=\"at cg hf hg hh hi\" target=\"_blank\" rel=\"noopener noreferrer\" href=\"https:\/\/medium.com\/@goyalsaurabh66\/project-tungsten-and-catalyst-sql-optimizer-9d3c83806b63\">https:\/\/medium.com\/@goyalsaurabh66\/project-tungsten-and-catalyst-sql-optimizer-9d3c83806b63<\/a><\/p>\n<p class=\"eq 
er dc gf es b et eu ev ew ex ey ez fa fb fc fd\" data-selectable-paragraph=\"\"><\/p>\n<\/blockquote>\n<p id=\"425f\" class=\"fl fm bz at fn b fo fp fq fr fs ft fu fv fw fx fy\" data-selectable-paragraph=\"\">These are the basic concepts you must know to continuously optimize and scale your jobs on production, irrespective of whether you are working with RDDs or SparkSQL or higher-level APIs, which now does many optimizations under the hood and recommended but knowing fundamentals always helps. I still refer to these notes from time to time to refresh.<\/p>\n<p id=\"104b\" class=\"fl fm bz at fn b fo fp fq fr fs ft fu fv fw fx fy\" data-selectable-paragraph=\"\">If you happen to reach the end of this article to read this, I will be happy to hear your feedback.<\/p>[\/et_pb_text][\/et_pb_column_inner][\/et_pb_row_inner][et_pb_row_inner _builder_version=&#8221;4.0.6&#8243; custom_padding=&#8221;9px|||||&#8221; global_module=&#8221;29718&#8243; saved_tabs=&#8221;all&#8221;][et_pb_column_inner _builder_version=&#8221;4.0.6&#8243;][et_pb_blurb title=&#8221;Written By Sambodhi Khandelwal&#8221; image=&#8221;https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/04\/T02J6GGTV-U02JKUN3D-cf206e549b01-512.jpg&#8221; icon_placement=&#8221;left&#8221; image_max_width=&#8221;115%&#8221; content_max_width=&#8221;1100px&#8221; _builder_version=&#8221;4.0.6&#8243; hover_enabled=&#8221;0&#8243; border_radii_image=&#8221;off||||&#8221;]<p><span>CTO &amp; Co-Founder at Streamhub | Technology &amp; Startups<\/span><\/p>\n<p><span><span style=\"font-weight: 400;\">Sambodhi is the driving force behind Streamhub\u2019s technical innovation. Passionate about startup culture as well as solving problems by leveraging modern technologies, she has worked with some of the biggest names in online media including News International, Yahoo! 
and Thomson Reuters.<\/span><\/span><\/p>[\/et_pb_blurb][\/et_pb_column_inner][\/et_pb_row_inner][\/et_pb_column][et_pb_column type=&#8221;1_4&#8243; _builder_version=&#8221;3.25&#8243; custom_padding=&#8221;|||&#8221; parallax__hover=&#8221;off&#8221; parallax_method__hover=&#8221;on&#8221; custom_padding__hover=&#8221;|||&#8221;][\/et_pb_column][\/et_pb_section]\n","protected":false},"excerpt":{"rendered":"<p><span class=\"span-reading-time rt-reading-time\" style=\"display: block;\"><span class=\"rt-label rt-prefix\">Reading Time: <\/span> <span class=\"rt-time\"> 20<\/span> <span class=\"rt-label rt-postfix\">minutes<\/span><\/span><\/p>\n<p>Master Spark fundamentals &amp; optimizations\u00a0Streamhub&#8217;s all-in-one notes on how to optimize &amp; scale Spark jobs for something more than a POCAt Streamhub, we deal with many forms of data like user\u2019s (every minute) player viewership data, user\u2019s commerce data, user\u2019s subscription information, metadata feeds, metadata front-loaded, user panels, and several other third-party datasets. 
It is [&hellip;]<\/p>\n","protected":false},"author":6,"featured_media":28377,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_et_pb_use_builder":"on","_et_pb_old_content":"","_et_gb_content_width":"","content-type":"","_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":false,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2}},"categories":[13],"tags":[],"class_list":["post-28376","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-news"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v27.6 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>Apache Spark Tuning Manual - Streamhub.co.uk<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Apache Spark Tuning Manual - Streamhub.co.uk\" \/>\n<meta property=\"og:description\" content=\"Reading Time:  20 minutesMaster Spark fundamentals &amp; optimizations\u00a0Streamhub&#039;s all-in-one notes on how to optimize &amp; scale Spark jobs for something more than a POCAt Streamhub, we deal with many forms of data like user\u2019s (every minute) player viewership data, user\u2019s commerce data, user\u2019s subscription information, metadata feeds, metadata front-loaded, user panels, and several other third-party datasets. 
It is [&hellip;]\" \/>\n<meta property=\"og:url\" content=\"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/\" \/>\n<meta property=\"og:site_name\" content=\"Streamhub.co.uk\" \/>\n<meta property=\"article:published_time\" content=\"2019-11-01T12:54:22+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2020-06-30T05:09:26+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png\" \/>\n\t<meta property=\"og:image:width\" content=\"320\" \/>\n\t<meta property=\"og:image:height\" content=\"136\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"author\" content=\"Sambodhi\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"Sambodhi\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"65 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\\\/\\\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#article\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/\"},\"author\":{\"name\":\"Sambodhi\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/person\\\/71fc449cd1d4677990e1942bee805c41\"},\"headline\":\"Apache Spark Tuning 
Manual\",\"datePublished\":\"2019-11-01T12:54:22+00:00\",\"dateModified\":\"2020-06-30T05:09:26+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/\"},\"wordCount\":12950,\"publisher\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2019\\\/11\\\/ApacheSpark.png\",\"articleSection\":[\"News\"],\"inLanguage\":\"en-US\"},{\"@type\":\"WebPage\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/\",\"name\":\"Apache Spark Tuning Manual - Streamhub.co.uk\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#primaryimage\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2019\\\/11\\\/ApacheSpark.png\",\"datePublished\":\"2019-11-01T12:54:22+00:00\",\"dateModified\":\"2020-06-30T05:09:26+00:00\",\"breadcrumb\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apache-spark-tuning-manual\\\/#primaryimage\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2019\\\/11\\\/ApacheSpark.png\",\"contentUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2019\\\/11\\\/ApacheSpark.png\",\"width\":320,\"height\":136},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/apac
he-spark-tuning-manual\\\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\\\/\\\/streamhub.co.uk\\\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Apache Spark Tuning Manual\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#website\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/\",\"name\":\"Streamhub.co.uk\",\"description\":\"Streamhub.co.uk\",\"publisher\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\\\/\\\/streamhub.co.uk\\\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":\"Organization\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#organization\",\"name\":\"Streamhub.co.uk\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/\",\"logo\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/logo\\\/image\\\/\",\"url\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/05\\\/SH-Logo.png\",\"contentUrl\":\"https:\\\/\\\/streamhub.co.uk\\\/wp-content\\\/uploads\\\/2020\\\/05\\\/SH-Logo.png\",\"width\":1397,\"height\":361,\"caption\":\"Streamhub.co.uk\"},\"image\":{\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/logo\\\/image\\\/\"},\"sameAs\":[\"https:\\\/\\\/www.linkedin.com\\\/company\\\/3006156\\\/admin\\\/feed\\\/posts\\\/\"]},{\"@type\":\"Person\",\"@id\":\"https:\\\/\\\/streamhub.co.uk\\\/#\\\/schema\\\/person\\\/71fc449cd1d4677990e1942bee805c41\",\"name\":\"Sambodhi\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"url\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d4
9b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"contentUrl\":\"https:\\\/\\\/secure.gravatar.com\\\/avatar\\\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g\",\"caption\":\"Sambodhi\"},\"url\":false}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"Apache Spark Tuning Manual - Streamhub.co.uk","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/","og_locale":"en_US","og_type":"article","og_title":"Apache Spark Tuning Manual - Streamhub.co.uk","og_description":"Reading Time:  20 minutesMaster Spark fundamentals &amp; optimizations\u00a0Streamhub's all-in-one notes on how to optimize &amp; scale Spark jobs for something more than a POCAt Streamhub, we deal with many forms of data like user\u2019s (every minute) player viewership data, user\u2019s commerce data, user\u2019s subscription information, metadata feeds, metadata front-loaded, user panels, and several other third-party datasets. It is [&hellip;]","og_url":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/","og_site_name":"Streamhub.co.uk","article_published_time":"2019-11-01T12:54:22+00:00","article_modified_time":"2020-06-30T05:09:26+00:00","og_image":[{"width":320,"height":136,"url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","type":"image\/png"}],"author":"Sambodhi","twitter_card":"summary_large_image","twitter_misc":{"Written by":"Sambodhi","Est. 
reading time":"65 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#article","isPartOf":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/"},"author":{"name":"Sambodhi","@id":"https:\/\/streamhub.co.uk\/#\/schema\/person\/71fc449cd1d4677990e1942bee805c41"},"headline":"Apache Spark Tuning Manual","datePublished":"2019-11-01T12:54:22+00:00","dateModified":"2020-06-30T05:09:26+00:00","mainEntityOfPage":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/"},"wordCount":12950,"publisher":{"@id":"https:\/\/streamhub.co.uk\/#organization"},"image":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#primaryimage"},"thumbnailUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","articleSection":["News"],"inLanguage":"en-US"},{"@type":"WebPage","@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/","url":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/","name":"Apache Spark Tuning Manual - 
Streamhub.co.uk","isPartOf":{"@id":"https:\/\/streamhub.co.uk\/#website"},"primaryImageOfPage":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#primaryimage"},"image":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#primaryimage"},"thumbnailUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","datePublished":"2019-11-01T12:54:22+00:00","dateModified":"2020-06-30T05:09:26+00:00","breadcrumb":{"@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#primaryimage","url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","contentUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","width":320,"height":136},{"@type":"BreadcrumbList","@id":"https:\/\/streamhub.co.uk\/apache-spark-tuning-manual\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/streamhub.co.uk\/"},{"@type":"ListItem","position":2,"name":"Apache Spark Tuning 
Manual"}]},{"@type":"WebSite","@id":"https:\/\/streamhub.co.uk\/#website","url":"https:\/\/streamhub.co.uk\/","name":"Streamhub.co.uk","description":"Streamhub.co.uk","publisher":{"@id":"https:\/\/streamhub.co.uk\/#organization"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/streamhub.co.uk\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":"Organization","@id":"https:\/\/streamhub.co.uk\/#organization","name":"Streamhub.co.uk","url":"https:\/\/streamhub.co.uk\/","logo":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/streamhub.co.uk\/#\/schema\/logo\/image\/","url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/05\/SH-Logo.png","contentUrl":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2020\/05\/SH-Logo.png","width":1397,"height":361,"caption":"Streamhub.co.uk"},"image":{"@id":"https:\/\/streamhub.co.uk\/#\/schema\/logo\/image\/"},"sameAs":["https:\/\/www.linkedin.com\/company\/3006156\/admin\/feed\/posts\/"]},{"@type":"Person","@id":"https:\/\/streamhub.co.uk\/#\/schema\/person\/71fc449cd1d4677990e1942bee805c41","name":"Sambodhi","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","url":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/9fb7cf9dc087f6bfa1d49b3efc66fdad1da54b9c6da8aca96898046519f25058?s=96&d=mm&r=g","caption":"Sambodhi"},"url":false}]}},"views":295,"jetpack_publicize_connections":[],"jetpack_featured_media_url":"https:\/\/streamhub.co.uk\/wp-content\/uploads\/2019\/11\/ApacheSpark.png","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/28376","targetHints":{"a
llow":["GET"]}}],"collection":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/users\/6"}],"replies":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/comments?post=28376"}],"version-history":[{"count":50,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/28376\/revisions"}],"predecessor-version":[{"id":31312,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/posts\/28376\/revisions\/31312"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/media\/28377"}],"wp:attachment":[{"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/media?parent=28376"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/categories?post=28376"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/streamhub.co.uk\/wp-json\/wp\/v2\/tags?post=28376"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}