def generate_linear_example(samples=int(1e5)):
    """Build a 1-D linear regression set whose noise grows with the input.

    Inputs come from ``np.random.sample`` (uniform on the unit interval) and
    are stored as float32. The target is ``5 * x + x**2 * eps`` with ``eps``
    drawn from a standard normal, so the spread widens as ``x`` increases.

    Args:
        samples: Number of points drawn before the train/validation split.

    Returns:
        A tuple ``(df_train, df_valid, df_test, target_columns)``:
        ``df_train`` and ``df_valid`` are the two halves of a 50/50 split
        (``random_state=42``) with columns ``"col1"`` and ``"target"``;
        ``df_test`` has only ``"col1"``, a grid of 1000 evenly spaced points
        from 0 to 1; ``target_columns`` is ``["target"]``.
    """

    def _labelled_frame(inputs, outputs):
        # Flatten the (n, 1) column vectors into plain DataFrame columns.
        return pd.DataFrame({"col1": inputs.ravel(), "target": outputs.ravel()})

    features = np.random.sample(samples).astype(np.float32).reshape(-1, 1)
    noise = np.random.standard_normal(features.shape)
    # Heteroscedastic noise: the x**2 factor scales the normal draw.
    targets = 5 * features + features ** 2 * noise

    train_x, valid_x, train_y, valid_y = train_test_split(
        features, targets, test_size=0.5, random_state=42
    )

    grid = np.linspace(0., 1., int(1e3)).astype(np.float32)

    return (
        _labelled_frame(train_x, train_y),
        _labelled_frame(valid_x, valid_y),
        pd.DataFrame({"col1": grid}),
        ["target"],
    )
def generate_non_linear_example(samples=int(1e5)):
    """Build a two-branch parabolic regression set with input-dependent noise.

    Two independent batches of ``samples`` inputs are drawn uniformly from
    [-10, 10). The first batch follows ``y = x**2 + 2 * r`` and the second
    ``y = -x**2 + 2 * r``, where ``r`` is normal noise with standard
    deviation ``|x|``. The batches are concatenated (``2 * samples`` rows in
    total) and the targets are rescaled jointly with a default
    ``MinMaxScaler``.

    Args:
        samples: Number of points drawn for each of the two branches.

    Returns:
        A tuple ``(df_train, df_valid, df_test, target_columns)``:
        ``df_train`` and ``df_valid`` are the two halves of a shuffled 50/50
        split (``random_state=42``) with columns ``"col1"`` and ``"target"``;
        ``df_test`` has only ``"col1"``, a grid of 1000 evenly spaced points
        from -10 to 10; ``target_columns`` is ``["target"]``.
    """
    # Upward-opening branch. The noise is drawn in one broadcast call:
    # passing the (1, samples) array as ``scale`` yields one draw per input,
    # so no Python-level loop over rows is needed.
    upper_x = np.float32(np.random.uniform(-10, 10, (1, samples)))
    upper_noise = np.random.normal(scale=np.abs(upper_x))
    upper_y = np.float32(np.square(upper_x) + upper_noise * 2.0)

    # Downward-opening branch, drawn independently of the first.
    lower_x = np.float32(np.random.uniform(-10, 10, (1, samples)))
    lower_noise = np.random.normal(scale=np.abs(lower_x))
    lower_y = np.float32(-np.square(lower_x) + lower_noise * 2.0)

    # Stack both branches and turn the row vectors into (2 * samples, 1) columns.
    x_data = np.concatenate((upper_x, lower_x), axis=1).T
    y_data = np.concatenate((upper_y, lower_y), axis=1).T

    # Rescale the targets of both branches together.
    y_data = MinMaxScaler().fit_transform(y_data)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_data, y_data, test_size=0.5, random_state=42, shuffle=True
    )
    x_test = np.linspace(-10, 10, int(1e3))[:, np.newaxis].astype(np.float32)

    df_train = pd.DataFrame({"col1": x_train.ravel(), "target": y_train.ravel()})
    df_valid = pd.DataFrame({"col1": x_valid.ravel(), "target": y_valid.ravel()})
    df_test = pd.DataFrame({"col1": x_test.ravel()})
    return (df_train, df_valid, df_test, ["target"])
def generate_step_linear_example(samples=int(1e5)):
    """Build a piecewise-linear regression set with a jump at ``x = 0.5``.

    Inputs come from ``np.random.sample`` and are stored as float32. Below
    0.5 the target is ``5 * x + x**2 * eps``; from 0.5 upward it is
    ``100 * x + x**2 + x**2 * eps``, with ``eps`` standard normal in both
    regimes. The targets are then rescaled with a default ``MinMaxScaler``.

    Args:
        samples: Number of points drawn before the train/validation split.

    Returns:
        A tuple ``(df_train, df_valid, df_test, target_columns)``:
        ``df_train`` and ``df_valid`` are the two halves of a shuffled 50/50
        split (``random_state=42``) with columns ``"col1"`` and ``"target"``;
        ``df_test`` has only ``"col1"``, a grid of 1000 evenly spaced points
        from 0 to 1; ``target_columns`` is ``["target"]``.
    """
    features = np.random.sample(samples)[:, np.newaxis].astype(np.float32)
    targets = np.zeros(features.shape)

    below_step = features < 0.5
    above_step = ~below_step
    low_x = features[below_step]
    high_x = features[above_step]

    # Noise for the low regime is drawn first, then for the high regime.
    low_noise = np.random.standard_normal(low_x.shape)
    targets[below_step] = 5 * low_x + low_x ** 2 * low_noise

    high_noise = np.random.standard_normal(high_x.shape)
    targets[above_step] = (100 * high_x + high_x ** 2) + high_x ** 2 * high_noise

    scaler = MinMaxScaler()
    targets = scaler.fit_transform(targets)

    train_x, valid_x, train_y, valid_y = train_test_split(
        features, targets, test_size=0.5, random_state=42, shuffle=True
    )
    grid = np.linspace(0., 1., int(1e3))[:, np.newaxis].astype(np.float32)

    frames = [
        pd.DataFrame({"col1": xs.ravel(), "target": ys.ravel()})
        for xs, ys in ((train_x, train_y), (valid_x, valid_y))
    ]
    test_frame = pd.DataFrame({"col1": grid.ravel()})
    return (frames[0], frames[1], test_frame, ["target"])
def generate_gaussian_mixture(samples=int(1e5)):
    """Build a two-branch regression set switched by an input-dependent weight.

    Inputs come from ``np.random.sample`` and are stored as float32. A weight
    ``sin(x) + 3 * x * cos(x)`` is normalised by its maximum and rounded, and
    the rounded value selects between two linear branches:
    ``2 * x + 0.5 * u`` where it is 1 and ``8 * x + 0.5 * u`` where it is 0.
    NOTE: despite the function name, the additive noise ``u`` is drawn with
    ``np.random.sample`` (uniform), and the branch choice is a hard switch on
    the rounded weight rather than a random draw.

    Args:
        samples: Number of points drawn before the train/validation split.

    Returns:
        A tuple ``(df_train, df_valid, df_test, target_columns)``:
        ``df_train`` and ``df_valid`` are the two halves of a 50/50 split
        (``random_state=42``) with columns ``"col1"`` and ``"target"``;
        ``df_test`` has only ``"col1"``, a grid of 1000 evenly spaced points
        from 0 to 1; ``target_columns`` is ``["target"]``.
    """
    features = np.random.sample(samples)[:, np.newaxis].astype(np.float32)
    flat_x = features.squeeze()

    weight = np.sin(features) + 3 * features * np.cos(features)
    weight = weight / weight.max()

    # The two candidate branches; their noise is drawn in this order.
    shallow = 2 * flat_x + 0.5 * np.random.sample(samples)
    steep = 8 * flat_x + 0.5 * np.random.sample(samples)

    # The rounded weight picks exactly one branch per point.
    selector = weight.round().squeeze()
    targets = selector * shallow + (1 - selector) * steep
    targets = targets.reshape(-1, 1)

    train_x, valid_x, train_y, valid_y = train_test_split(
        features, targets, test_size=0.5, random_state=42
    )
    grid = np.linspace(0., 1., int(1e3))[:, np.newaxis].astype(np.float32)

    train_frame = pd.DataFrame({"col1": train_x.ravel(), "target": train_y.ravel()})
    valid_frame = pd.DataFrame({"col1": valid_x.ravel(), "target": valid_y.ravel()})
    test_frame = pd.DataFrame({"col1": grid.ravel()})
    return (train_frame, valid_frame, test_frame, ["target"])