Make Spark application use all available YARN resources

I am currently running a cluster of 5 Raspberry Pi 4 boards (4 GB each) and have installed Hadoop to manage the resources. Unfortunately, I am not able to configure the settings correctly so that the Apache Spark application, which I submit on top of the Hadoop framework, uses the full resources of the cluster (4 worker nodes, 1 master node).

Does anybody know how I have to configure the settings so that a single application can use the full resources (16 cores, 14 GB RAM)?
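
To be explicit about how I arrive at those totals (together with the yarn-site.xml values below):

4 worker nodes x 4 cores (Raspberry Pi 4)                      = 16 cores
4 worker nodes x 3584 MB (yarn.nodemanager.resource.memory-mb) = 14336 MB ≈ 14 GB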

My current settings are:

mapred-site.xml

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.resource.memory-mb</name>
    <value>3584</value> <!--512-->
  </property>
  <property>
    <name>mapreduce.map.resource.memory-mb</name>
    <value>3584</value> <!--256-->
  </property>
  <property>
    <name>mapreduce.reduce.resource.memory-mb</name>
    <value>3584</value> <!--256-->
  </property>
</configuration>

yarn-site.xml

<configuration>

<!-- Site specific YARN configuration properties -->
  <property>
    <name>yarn.acl.enable</name>
    <value>0</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>pi1</value>
  </property>  
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>3584</value> <!--1536-->
  </property>
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>8</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>3584</value> <!--1536-->
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>64</value> <!--128-->
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-vcores</name>
    <value>1</value> <!--128-->
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-vcores</name>
    <value>8</value> <!--128-->
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>true</value>
  </property>
</configuration>
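
In case it is relevant: I assume the resources each NodeManager actually advertises can be checked from the ResourceManager host (pi1) with something like the following, or in the ResourceManager web UI (default port 8088):

# list every NodeManager together with its state and running containers
yarn node -list -all

# per-node memory/vcores should also be visible at http://pi1:8088/cluster/nodes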

spark-defaults.conf

# Example:
# spark.master                     spark://master:7077
# spark.eventLog.enabled           true
# spark.eventLog.dir               hdfs://namenode:8021/directory
# spark.serializer                 org.apache.spark.serializer.KryoSerializer
# spark.driver.memory              5g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

spark.master yarn
spark.driver.memory 2048m
spark.yarn.am.memory 512m
spark.executor.memory 1024m
spark.executor.cores 4
#spark.driver.memory 512m
#spark.yarn.am.memory 512m
#spark.executor.memory 512m

spark.eventLog.enabled true
spark.eventLog.dir hdfs://pi1:9000/spark-logs
spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider
spark.history.fs.logDirectory hdfs://pi1:9000/spark-logs
spark.history.fs.update.interval 10s
spark.history.ui.port 18080
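
For illustration, my rough understanding of the sizing arithmetic (assuming client deploy mode, the default executor memory overhead of max(384 MB, 10% of spark.executor.memory), and that the YARN ApplicationMaster needs its own small container) is sketched below; I am not sure whether this is the right way to fill the cluster:

# Per node: yarn.scheduler.maximum-allocation-mb = 3584 MB, so each executor
# container (executor memory + overhead) must stay below that,
# e.g. 2048 MB + 384 MB = 2432 MB.
# 4 executors (4 x 2432 MB) + 1 AM container (512 MB + 384 MB) ≈ 10.4 GB of the 14 GB.
# 4 executors x 4 cores = 16 cores.
spark-submit \
  --master yarn \
  --deploy-mode client \
  --num-executors 4 \
  --executor-cores 4 \
  --executor-memory 2g \
  --driver-memory 2g \
  my-application.jar   # placeholder for my actual jar

Raising --executor-memory to 3g would give 3072 MB + 384 MB = 3456 MB per container, which still fits under the 3584 MB node limit, but then the node hosting the 896 MB AM container could no longer fit a full executor.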

If somebody has a suggestion, I would be really thankful. :)

P.S.: If more information is required, just tell me.


