<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: MarwanRadwanRiad</title>
    <description>The latest articles on DEV Community by MarwanRadwanRiad (@marwanradwanriad).</description>
    <link>https://dev.to/marwanradwanriad</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F1120444%2Fe92511ae-86cb-47ae-8da2-e39ebc159411.png</url>
      <title>DEV Community: MarwanRadwanRiad</title>
      <link>https://dev.to/marwanradwanriad</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/marwanradwanriad"/>
    <language>en</language>
    <item>
      <title>Error when running PySpark on a Windows cluster</title>
      <dc:creator>MarwanRadwanRiad</dc:creator>
      <pubDate>Sat, 15 Jul 2023 20:37:16 +0000</pubDate>
      <link>https://dev.to/marwanradwanriad/run-pyspark-on-cluster-windows-2hl2</link>
      <guid>https://dev.to/marwanradwanriad/run-pyspark-on-cluster-windows-2hl2</guid>
      <description>&lt;p&gt;When I run the code locally it works fine, but when I run it on the master and worker it returns this error: Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.&lt;br&gt;
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 5) (192.168.1.7 executor 0): java.io.IOException: Failed to delete original file 'C:\Users\me\AppData\Local\Temp\spark-a1d455a2-235e-4246-aebe-a5a6c65e3366\executor-82f10b93-6cd7-4046-b825-68015156deb2\spark-be325be5-2631-4e3b-aa8d-18a13f2d58bd\broadcast1438069735290788560' after copy to 'C:\Users\me\AppData\Local\Temp\spark-a1d455a2-235e-4246-aebe-a5a6c65e3366\executor-82f10b93-6cd7-4046-b825-68015156deb2\blockmgr-08f31c2a-5b65-413b-a111-ca1f9fa90aea\37\broadcast_0_python'&lt;br&gt;
and this is the code:&lt;br&gt;
from pyspark import SparkContext, SparkConf, broadcast&lt;br&gt;
from typing import List&lt;br&gt;
import findspark&lt;/p&gt;

&lt;p&gt;if __name__ == "__main__":&lt;br&gt;
    findspark.init("c:\spark-3.4.1")&lt;br&gt;
    conf = SparkConf().setAppName("MyApp").setMaster("spark://ipaddress:7077")&lt;br&gt;
    conf.set("spark.executor.memory", "4g")&lt;br&gt;
    conf.set("spark.deployMode", "cluster")&lt;br&gt;
    #conf.set("spark.task.partitioner", "random")&lt;br&gt;
    conf.set("spark.executor.memory", "4g")&lt;br&gt;
    conf.set("spark.driver.memory", "4g")&lt;br&gt;
    conf.set("spark.executor.instances", "2")&lt;br&gt;
    conf.set("spark.worker.cleanup.interval", "600")&lt;br&gt;
    #conf.set("spark.local.dir", "E:\aaa")&lt;br&gt;
    conf.set("spark.worker.cleanup.enabled",True)&lt;br&gt;
    conf.set("spark.broadcast.reuse",False)&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sc = SparkContext(conf=conf)

# Define the array to be shared
my_array = [1, 2, 3, 4, 5]

# Broadcast the array to all worker nodes
broadcast_array = sc.broadcast(my_array)

# Define the function to be run on the array
def my_function(x):
    # Access the broadcasted array
    arr = broadcast_array.value
    return [i * x for i in arr]

# Create RDD and run the function multiple times on the shared array
rdd = sc.parallelize([1, 2, 3, 4])
results = rdd.flatMap(my_function).collect()

# Stop the Spark context
sc.stop()

# Print the results
print(results)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;
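
&lt;p&gt;For comparison, this is a minimal sketch of the same broadcast logic with a local master, which is the setup that works for me; the install path and app name here are just illustrative:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;from pyspark import SparkConf, SparkContext
import findspark

if __name__ == "__main__":
    findspark.init(r"c:\spark-3.4.1")  # assumed install path, same as above

    # local[*] runs the driver and executors in one process, so the
    # executor-side broadcast temp-file copy/delete from the stack
    # trace above is not exercised
    conf = SparkConf().setAppName("MyAppLocal").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Broadcast the array to the (local) executors
    broadcast_array = sc.broadcast([1, 2, 3, 4, 5])

    def my_function(x):
        # Multiply every broadcast element by x
        return [i * x for i in broadcast_array.value]

    results = sc.parallelize([1, 2, 3, 4]).flatMap(my_function).collect()
    sc.stop()

    print(results)  # [1, 2, 3, 4, 5, 2, 4, 6, 8, 10, ...]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;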

</description>
    </item>
  </channel>
</rss>
