import pandas as pd
from datetime import datetime as dt
weekdays = pd.date_range("2015-01-01","2016-12-31", freq="B")
[print(x) for x in weekdays[:10]]
2015-01-01 00:00:00 2015-01-02 00:00:00 2015-01-05 00:00:00 2015-01-06 00:00:00 2015-01-07 00:00:00 2015-01-08 00:00:00 2015-01-09 00:00:00 2015-01-12 00:00:00 2015-01-13 00:00:00 2015-01-14 00:00:00
[None, None, None, None, None, None, None, None, None, None]
weekdaysM10 = weekdays[9:]
weekdaysM90 = weekdays[89:]
arr = zip(weekdays, weekdaysM10, weekdaysM90)
date, dateM10, dateM90 = zip(*arr)
df = pd.DataFrame({'datem10':dateM10, "datem90":dateM90}, index=date)
df.reset_index(inplace=True)
df.head(5)
index | datem10 | datem90 | |
---|---|---|---|
0 | 2015-01-01 | 2015-01-14 | 2015-05-06 |
1 | 2015-01-02 | 2015-01-15 | 2015-05-07 |
2 | 2015-01-05 | 2015-01-16 | 2015-05-08 |
3 | 2015-01-06 | 2015-01-19 | 2015-05-11 |
4 | 2015-01-07 | 2015-01-20 | 2015-05-12 |
dfSpark = spark.createDataFrame(df)
dfSpark.printSchema()
root |-- index: long (nullable = true) |-- datem10: long (nullable = true) |-- datem90: long (nullable = true)
dfSpark.show(5)
+-------------------+-------------------+-------------------+ | index| datem10| datem90| +-------------------+-------------------+-------------------+ |1420070400000000000|1421193600000000000|1430870400000000000| |1420156800000000000|1421280000000000000|1430956800000000000| |1420416000000000000|1421366400000000000|1431043200000000000| |1420502400000000000|1421625600000000000|1431302400000000000| |1420588800000000000|1421712000000000000|1431388800000000000| +-------------------+-------------------+-------------------+ only showing top 5 rows
dfSpark.write.parquet("hdfs:///user/sergey/calendar10")
Write a comment: