import pandas as pd
from datetime import datetime as dt

 

# All business days (Mon-Fri; freq="B" does NOT exclude holidays)
# between 2015-01-01 and 2016-12-31, inclusive.
weekdays = pd.date_range("2015-01-01", "2016-12-31", freq="B")

# Preview the first ten generated dates. A plain loop is used for the
# side effect; the original list comprehension of print() calls built
# a throwaway list of None values.
for day in weekdays[:10]:
    print(day)
2015-01-01 00:00:00
2015-01-02 00:00:00
2015-01-05 00:00:00
2015-01-06 00:00:00
2015-01-07 00:00:00
2015-01-08 00:00:00
2015-01-09 00:00:00
2015-01-12 00:00:00
2015-01-13 00:00:00
2015-01-14 00:00:00
[None, None, None, None, None, None, None, None, None, None]

 

# Shifted views of the business-day calendar: weekdaysM10[i] is the
# business day 9 positions after weekdays[i] (2015-01-01 pairs with
# 2015-01-14 per the output below), and weekdaysM90[i] is 89 after.
# NOTE(review): the names say 10/90 but the slice offsets are 9/89 —
# i.e. the 10th/90th business day counting the start date itself as
# day 1. Confirm this off-by-one is intended.
weekdaysM10 = weekdays[9:]
weekdaysM90 = weekdays[89:]

 

# Pair each base date with its shifted counterparts. zip() stops at
# the shortest input (weekdaysM90), so trailing base dates that have
# no 90-position successor are dropped automatically.
arr = zip(weekdays, weekdaysM10, weekdaysM90)
# Transpose the row triples back into three column tuples.
columns = list(zip(*arr))
date, dateM10, dateM90 = columns

 

# Assemble the calendar table: index = base date, columns = the dates
# 9 and 89 business days out.
frame_data = {"datem10": dateM10, "datem90": dateM90}
df = pd.DataFrame(frame_data, index=date)
# Promote the date index to an ordinary "index" column (rebinding is
# equivalent to reset_index(inplace=True) for this module-level frame).
df = df.reset_index()
df.head(5)
index datem10 datem90
0 2015-01-01 2015-01-14 2015-05-06
1 2015-01-02 2015-01-15 2015-05-07
2 2015-01-05 2015-01-16 2015-05-08
3 2015-01-06 2015-01-19 2015-05-11
4 2015-01-07 2015-01-20 2015-05-12

 

# Convert the pandas frame to a Spark DataFrame ("spark" is the
# SparkSession supplied by the notebook/shell environment).
# NOTE(review): as the printed schema below shows, the pandas
# datetime64[ns] columns arrive in Spark as plain longs (nanoseconds
# since the epoch), not timestamps — cast them if date semantics are
# needed downstream.
dfSpark = spark.createDataFrame(df)
dfSpark.printSchema()
root
 |-- index: long (nullable = true)
 |-- datem10: long (nullable = true)
 |-- datem90: long (nullable = true)

 

dfSpark.show(5)
+-------------------+-------------------+-------------------+
|              index|            datem10|            datem90|
+-------------------+-------------------+-------------------+
|1420070400000000000|1421193600000000000|1430870400000000000|
|1420156800000000000|1421280000000000000|1430956800000000000|
|1420416000000000000|1421366400000000000|1431043200000000000|
|1420502400000000000|1421625600000000000|1431302400000000000|
|1420588800000000000|1421712000000000000|1431388800000000000|
+-------------------+-------------------+-------------------+
only showing top 5 rows

 

dfSpark.write.parquet("hdfs:///user/sergey/calendar10")

 

Write a comment:

*

Your email address will not be published.

© 2014 In R we trust.
Top
Follow us: