Prepare data

```python
import time

import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.regression import LabeledPoint

%matplotlib inline

sc = SparkContext()
```

clean.csv is the original file with NaN values filled with the mean of each column.

```python
features = sc.textFile('/home/sergey/MachineLearning/biline/clean.csv')
features = features.map(lambda …
```
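The map call is cut off in this excerpt. A minimal sketch of what the parsing step could look like, assuming a hypothetical parse_line helper, that each row of clean.csv is purely numeric with the target in the last column, and that a header row may need to be skipped:

```python
def parse_line(line):
    # Hypothetical parser: split a comma-separated row into floats,
    # take the last column as the label and the rest as features.
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:-1])

# Drop a header row if one is present, then build an RDD of LabeledPoint
header = features.first()
parsed = features.filter(lambda line: line != header).map(parse_line)
```

An RDD of LabeledPoint is the input format expected by LogisticRegressionWithSGD.train and RandomForest.trainClassifier, which is presumably why LabeledPoint is imported above.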
```python
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from math import log

def logloss(p, y):
    # Log loss of a predicted probability p against a true label y (0 or 1).
    # Clamp p away from exactly 0 or 1 so log() stays finite.
    epsilon = 10e-12
    if p == 0:
        p += epsilon
    if p == 1:
        p -= epsilon
    if y == 1:
        return -log(p)
    if y == 0:
        return -log(1 - p)

def evaluate_logloss(p, labels):
    # Mean log loss of a single constant prediction p over all labels
    return sum(map(lambda x: logloss(p, x), labels)) / len(labels)
```
…
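As a quick sanity check, evaluate_logloss can score a constant prediction against a list of labels; the toy labels and the probabilities below are made up for illustration only:

```python
labels = [0, 0, 1, 0, 1]   # toy 0/1 labels, illustration only

for p in (0.1, 0.3, 0.5):
    print(p, evaluate_logloss(p, labels))

# The constant prediction that minimizes mean log loss is the label mean
best_p = float(sum(labels)) / len(labels)
print('baseline', best_p, evaluate_logloss(best_p, labels))
```

A constant baseline like this gives a reference score to beat before comparing the logistic regression and random forest models.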