# Source code for methylnet.hyperparameter_scans

"""
hyperparameter_scans.py
=======================
Run randomized grid search to find ideal model hyperparameters, with possible deployments to batch system for scalability.
"""

import os, pandas as pd, numpy as np, subprocess
import time
from methylnet.torque_jobs import assemble_run_torque

[docs]def find_top_jobs(hyperparameter_input_csv,hyperparameter_output_log, n_top_jobs, crossover_p=0, val_loss_column='min_val_loss'): """Finds top performing jobs from hyper parameter scan to rerun and cross-over parameters. Parameters ---------- hyperparameter_input_csv : str CSV file containing hyperparameter inputs. hyperparameter_output_log : str CSV file containing prior runs. n_top_jobs : int Number of top jobs to select crossover_p : float Rate of cross over of reused hyperparameters val_loss_column : str Loss column used to select top jobs Returns ------- list List of list of new parameters for jobs to run. """ custom_jobs = pd.read_csv(hyperparameter_input_csv) hyperparam_output = pd.read_csv(hyperparameter_output_log)[['job_name',val_loss_column]] best_outputs = hyperparam_output.sort_values(val_loss_column,ascending=True).iloc[:n_top_jobs,:] custom_jobs = custom_jobs[np.isin(custom_jobs['--job_name'].values,best_outputs['job_name'].values)] if custom_jobs.shape[0]==0 and best_outputs.shape[0]>0: custom_jobs = best_outputs.rename(columns={'--{}'.format(k):k for k in list(best_outputs)})[list(custom_jobs)] custom_jobs.loc[:,'--hidden_layer_topology']=custom_jobs.loc[:,'--hidden_layer_topology'].map(lambda x: x.replace('[','').replace(']','')) custom_jobs.loc[:,'--job_name']='False' if crossover_p: for j in range(1,custom_jobs.shape[1]): vals=custom_jobs.iloc[:,j].unique() for i in range(custom_jobs.shape[0]): if np.random.rand() <= crossover_p: custom_jobs.iloc[i,j]=np.random.choice(vals) return [custom_jobs]
def replace_grid(old_grid, new_grid, topology_grid):
    """Merge a YAML-supplied search grid into the default grid.

    Every key of ``new_grid`` except ``topology_grid`` is written into
    ``old_grid`` under a ``--``-prefixed option name. A ``topology_grid``
    entry, if present, replaces the supplied ``topology_grid`` and is popped
    from ``new_grid`` as a side effect.

    Returns
    -------
    tuple
        ``(old_grid, topology_grid)`` after merging.
    """
    for key in new_grid.keys():
        if key == 'topology_grid':
            continue
        old_grid['--{}'.format(key)] = new_grid[key]
    if 'topology_grid' in new_grid:
        topology_grid = new_grid.pop('topology_grid')
    return old_grid, topology_grid
def generate_topology(topology_grid, probability_decay_factor=0.9):
    """Generate a random hidden-layer topology string.

    Draws ``round(3 * probability_decay_factor)`` layer sizes from
    ``topology_grid`` with geometrically decaying selection probabilities,
    drops zero entries (the "no layer" sentinel), and joins the remainder
    with commas.

    Parameters
    ----------
    topology_grid : list
        Candidate hidden-layer sizes (number of neurons) to choose from;
        zero entries are filtered out of the result.
    probability_decay_factor : float
        Model-complexity knob, clamped to >= 0. Lower values favor narrower
        networks; 0 always yields an empty topology.

    Returns
    -------
    str
        Comma-separated hidden layer sizes, or '' when no layers are drawn.
    """
    probability_decay_factor = float(max(probability_decay_factor, 0.))
    # Geometric decay over grid positions, normalized to a distribution so
    # later (wider) entries are progressively less likely.
    p = probability_decay_factor ** np.arange(len(topology_grid))
    p /= sum(p)
    n_layer_attempts = int(round(3 * probability_decay_factor))
    if not n_layer_attempts:
        return ''
    # Zero-sized draws are dropped; the original trailing duplicate
    # `return ''` was unreachable and has been removed.
    topology = [size for size in np.random.choice(topology_grid, n_layer_attempts, p=p) if size]
    return ','.join(map(str, topology))
def coarse_scan(hyperparameter_input_csv, hyperparameter_output_log, generate_input, job_chunk_size, stratify_column, reset_all, torque, gpu, gpu_node, nohup, mlp=False, custom_jobs=None, model_complexity_factor=0.9, set_beta=-1., n_jobs=4, categorical=True, add_softmax=False, additional_command="", cuda=True, new_grid=None):
    """Perform randomized hyperparameter grid search.

    Builds (or extends) a CSV of candidate hyperparameter settings, then
    launches one training command per not-yet-run row -- directly, via
    nohup, or as torque batch jobs -- and stamps each launched row with a
    random job id before rewriting the CSV.

    Parameters
    ----------
    hyperparameter_input_csv : str
        CSV file containing hyperparameter inputs.
    hyperparameter_output_log : str
        CSV file containing prior runs.
    generate_input : bool
        Generate new random hyperparameter rows.
    job_chunk_size : int
        Number of jobs to be launched at the same time.
    stratify_column : str or list
        Stratification column(s); a string for the VAE path, a list of
        columns for the MLP prediction path.
    reset_all : bool
        Rerun all jobs previously scanned.
    torque : bool
        Run jobs using torque.
    gpu : int
        GPU to use; -1 cycles over GPUs 0-3.
    gpu_node : int
        GPU node for torque submission; -1 to be agnostic.
    nohup : bool
        Launch jobs using nohup.
    mlp : bool
        If running prediction job (classification/regression) after VAE.
    custom_jobs : list or None
        Custom job parameters to run (list whose first element is a
        DataFrame); overrides generated input.
    model_complexity_factor : float
        Complexity knob forwarded to generate_topology; lower values favor
        narrower networks.
    set_beta : float
        When not -1., fix the KL-divergence weight beta to this value
        instead of scanning over it.
    n_jobs : int
        Number of random jobs to generate.
    categorical : bool
        Classification task?
    add_softmax : bool
        Add softmax layer at end of neural network.
    additional_command : str
        Extra options forwarded to the torque job assembler.
    cuda : bool
        Whether to use GPU.
    new_grid : dict or None
        YAML-supplied grid overriding the default search grid.
    """
    from itertools import cycle

    # Fix: mutable default arguments ([] / {}) replaced by None sentinels.
    custom_jobs = custom_jobs or []
    new_grid = new_grid or {}

    # Fix: the old `csv_path[:csv_path.rfind('/')]` slicing mangled bare
    # filenames (rfind == -1 chopped the last character); only create a
    # directory when the path actually has one.
    out_dir = os.path.dirname(hyperparameter_input_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    generated_input = []
    np.random.seed(int(time.time()))

    if mlp:
        grid = {'--learning_rate_vae': [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1],
                '--learning_rate_mlp': [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1],
                '--weight_decay': [1e-4],
                '--n_epochs': [25, 50, 75, 100, 200, 500, 700],
                '--scheduler': ['warm_restarts', 'null'],
                '--t_max': [10],
                '--eta_min': [1e-7, 1e-6],
                '--t_mult': [1., 1.2, 1.5, 2],
                '--batch_size': [50, 100, 256, 512],
                '--dropout_p': [0., 0.1, 0.2, 0.3, 0.5],
                '--n_workers': [4],
                '--loss_reduction': ['sum']}
        topology_grid = [0, 100, 200, 300, 500, 1000, 2000, 3000, 4096]
    else:
        grid = {'--n_latent': [100, 150, 200, 300, 500],
                '--learning_rate': [5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1],
                '--weight_decay': [1e-4],
                '--n_epochs': [25, 50, 75, 100, 200, 500, 700],
                '--kl_warm_up': [0, 20],
                '--beta': [0., 0.5, 1, 10, 50, 100, 200, 500] if set_beta == -1. else [set_beta],
                '--scheduler': ['warm_restarts', 'null'],
                '--t_max': [10],
                '--eta_min': [1e-7, 1e-6],
                '--t_mult': [1., 1.2, 1.5, 2],
                '--batch_size': [50, 100, 256, 512],
                '--n_workers': [4],
                '--loss_reduction': ['sum']}
        topology_grid = [0, 100, 200, 300, 500, 1000, 2000]

    if new_grid:
        grid, topology_grid = replace_grid(grid, new_grid, topology_grid)
    # 40 pre-drawn random topologies become one more scanned dimension.
    grid['--hidden_layer_topology' if mlp else '--hidden_layer_encoder_topology'] = [
        generate_topology(topology_grid, probability_decay_factor=model_complexity_factor) for i in range(40)]

    if generate_input:
        for i in range(n_jobs):
            generated_input.append(['False'] + [np.random.choice(grid[k]) for k in grid])
        generated_input = [pd.DataFrame(generated_input, columns=['--job_name'] + list(grid.keys()))]
    if custom_jobs:
        # Custom jobs replace generated input; blank their names so they run.
        custom_jobs[0].loc[:, '--job_name'] = 'False'
        generated_input = custom_jobs

    lower = lambda x: x.lower()
    gpus = cycle(range(4)) if gpu == -1 else cycle([gpu])

    # Merge any existing scan CSV (minus pandas index columns) with new rows.
    if os.path.exists(hyperparameter_input_csv):
        df = pd.read_csv(hyperparameter_input_csv)
        df = [df[[col for col in list(df) if not col.startswith('Unnamed')]]]
    else:
        df = []
    df = pd.concat(df + generated_input, axis=0)[['--job_name'] + list(grid.keys())].fillna('')
    print(df)
    if reset_all:
        df.loc[:, '--job_name'] = 'False'
    # Rows whose job name is 'False' have not been launched yet.
    df_final = df[df['--job_name'].astype(str).map(lower) == 'false'].reset_index(drop=True)[list(grid.keys())]

    commands = []
    for i in range(df_final.shape[0]):
        job_id = str(np.random.randint(0, 100000000))
        # Collect populated option/value pairs; fillna('') above guarantees
        # empties are '' (the old `!= np.nan` comparison was always True and
        # has been replaced with a real null check).
        opts = ' '.join('{} {}'.format(k2, df_final.loc[i, k2]) for k2 in list(df_final)
                        if pd.notnull(df_final.loc[i, k2]) and df_final.loc[i, k2] != '')
        if not mlp:
            commands.append('sh -c "methylnet-embed perform_embedding -bce {} -v -j {} -hl {} -sc {} {} && pymethyl-visualize transform_plot -i embeddings/vae_methyl_arr.pkl -o visualizations/{}_vae_embed.html -c {} -nn 10 "'.format(
                "-c" if cuda else "", job_id, hyperparameter_output_log, stratify_column, opts, job_id, stratify_column))
        else:
            commands.append('sh -c "methylnet-predict make_prediction {} {} {} {} -v {} -j {} -hl {} {} && {}"'.format(
                "-c" if cuda else "",
                '-sft' if add_softmax else '',
                '-cat' if categorical else '',
                ''.join(' -ic {}'.format(col) for col in stratify_column),
                '-do' if stratify_column[0] == 'disease_only' else '',
                job_id, hyperparameter_output_log, opts,
                '&&'.join(" pymethyl-visualize transform_plot -i predictions/vae_mlp_methyl_arr.pkl -o visualizations/{}_{}_mlp_embed.html -c {} -nn 8 ".format(job_id, col, col) for col in stratify_column)))
        # Stamp the first still-unnamed row in df with this job id.
        df.loc[np.arange(df.shape[0]) == np.where(df['--job_name'].astype(str).map(lower) == 'false')[0][0], '--job_name'] = job_id

    for i in range(len(commands)):
        commands[i] = '{} {} {} {}'.format(
            'CUDA_VISIBLE_DEVICES="{}"'.format(next(gpus)) if not torque else "",
            'nohup' if nohup else '', commands[i], '&' if nohup else '')

    if torque:
        for command in commands:
            assemble_run_torque(command, use_gpu=cuda, additions=additional_command,
                                queue='gpuq' if cuda else "normal", time=4, ngpu=1,
                                additional_options='' if gpu_node == -1 else ' -l hostlist=g0{}'.format(gpu_node))
    else:
        if len(commands) == 1:
            subprocess.call(commands[0], shell=True)
        else:
            # Fix: when job_chunk_size exceeded len(commands) the old
            # `len(commands)//job_chunk_size` asked np.array_split for 0
            # sections and raised; clamp to at least one chunk.
            n_chunks = max(1, len(commands) // job_chunk_size)
            for command_list in np.array_split(commands, n_chunks):
                for command in command_list:
                    if nohup:
                        print(command)
                    subprocess.call(command, shell=True)
    df.to_csv(hyperparameter_input_csv)