#region License Information /* HeuristicLab * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Threading; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.Encodings.PermutationEncoding; using HeuristicLab.Optimization; using HeuristicLab.Parameters; using HeuristicLab.PluginInfrastructure; using HeuristicLab.Problems.DataAnalysis; using HeuristicLab.Random; using HEAL.Attic; namespace HeuristicLab.Algorithms.DataAnalysis { [StorableType("FC8D8E5A-D16D-41BB-91CF-B2B35D17ADD7")] [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 95)] [Item("Decision Tree Regression (DT)", "A regression tree / rule set learner")] public sealed class DecisionTreeRegression : FixedDataAnalysisAlgorithm { public override bool SupportsPause { get { return true; } } public const string RegressionTreeParameterVariableName = "RegressionTreeParameters"; public const string ModelVariableName = "Model"; public const string PruningSetVariableName = "PruningSet"; public const string TrainingSetVariableName = "TrainingSet"; #region Parameter names private const string GenerateRulesParameterName = "GenerateRules"; private const string HoldoutSizeParameterName = "HoldoutSize"; private const string SplitterParameterName = "Splitter"; private const string MinimalNodeSizeParameterName = "MinimalNodeSize"; private const string LeafModelParameterName = "LeafModel"; private const string PruningTypeParameterName = "PruningType"; private const string SeedParameterName = "Seed"; private const string SetSeedRandomlyParameterName = "SetSeedRandomly"; private const string UseHoldoutParameterName = "UseHoldout"; #endregion #region Parameter properties public IFixedValueParameter GenerateRulesParameter { get { return (IFixedValueParameter)Parameters[GenerateRulesParameterName]; } } public IFixedValueParameter HoldoutSizeParameter { get { return (IFixedValueParameter)Parameters[HoldoutSizeParameterName]; } } public IConstrainedValueParameter SplitterParameter { get { return (IConstrainedValueParameter)Parameters[SplitterParameterName]; } } public IFixedValueParameter MinimalNodeSizeParameter { get { return (IFixedValueParameter)Parameters[MinimalNodeSizeParameterName]; } } public IConstrainedValueParameter LeafModelParameter { get { return (IConstrainedValueParameter)Parameters[LeafModelParameterName]; } } public IConstrainedValueParameter PruningTypeParameter { get { return (IConstrainedValueParameter)Parameters[PruningTypeParameterName]; } } public IFixedValueParameter SeedParameter { get { return (IFixedValueParameter)Parameters[SeedParameterName]; } } public IFixedValueParameter SetSeedRandomlyParameter { get { return (IFixedValueParameter)Parameters[SetSeedRandomlyParameterName]; } } public IFixedValueParameter UseHoldoutParameter { get { return (IFixedValueParameter)Parameters[UseHoldoutParameterName]; } } #endregion #region Properties public bool GenerateRules { get { return GenerateRulesParameter.Value.Value; } set { GenerateRulesParameter.Value.Value = value; } } public double HoldoutSize { get { return HoldoutSizeParameter.Value.Value; } set { HoldoutSizeParameter.Value.Value = value; } } public ISplitter Splitter { get { return SplitterParameter.Value; } // no setter because this is a constrained parameter } public int MinimalNodeSize { get { return MinimalNodeSizeParameter.Value.Value; } set { MinimalNodeSizeParameter.Value.Value = value; } } public ILeafModel LeafModel { get { return LeafModelParameter.Value; } } public IPruning Pruning { get { return PruningTypeParameter.Value; } } public int Seed { get { return SeedParameter.Value.Value; } set { SeedParameter.Value.Value = value; } } public bool SetSeedRandomly { get { return SetSeedRandomlyParameter.Value.Value; } set { SetSeedRandomlyParameter.Value.Value = value; } } public bool UseHoldout { get { return UseHoldoutParameter.Value.Value; } set { UseHoldoutParameter.Value.Value = value; } } #endregion #region State [Storable] private IScope stateScope; #endregion #region Constructors and Cloning [StorableConstructor] private DecisionTreeRegression(StorableConstructorFlag _) : base(_) { } private DecisionTreeRegression(DecisionTreeRegression original, Cloner cloner) : base(original, cloner) { stateScope = cloner.Clone(stateScope); } public DecisionTreeRegression() { var modelSet = new ItemSet(ApplicationManager.Manager.GetInstances()); var pruningSet = new ItemSet(ApplicationManager.Manager.GetInstances()); var splitterSet = new ItemSet(ApplicationManager.Manager.GetInstances()); Parameters.Add(new FixedValueParameter(GenerateRulesParameterName, "Whether a set of rules or a decision tree shall be created (default=false)", new BoolValue(false))); Parameters.Add(new FixedValueParameter(HoldoutSizeParameterName, "How much of the training set shall be reserved for pruning (default=20%).", new PercentValue(0.2))); Parameters.Add(new ConstrainedValueParameter(SplitterParameterName, "The type of split function used to create node splits (default='Splitter').", splitterSet, splitterSet.OfType().First())); Parameters.Add(new FixedValueParameter(MinimalNodeSizeParameterName, "The minimal number of samples in a leaf node (default=1).", new IntValue(1))); Parameters.Add(new ConstrainedValueParameter(LeafModelParameterName, "The type of model used for the nodes (default='LinearLeaf').", modelSet, modelSet.OfType().First())); Parameters.Add(new ConstrainedValueParameter(PruningTypeParameterName, "The type of pruning used (default='ComplexityPruning').", pruningSet, pruningSet.OfType().First())); Parameters.Add(new FixedValueParameter(SeedParameterName, "The random seed used to initialize the new pseudo random number generator.", new IntValue(0))); Parameters.Add(new FixedValueParameter(SetSeedRandomlyParameterName, "True if the random seed should be set to a random value, otherwise false.", new BoolValue(true))); Parameters.Add(new FixedValueParameter(UseHoldoutParameterName, "True if a holdout set should be generated, false if splitting and pruning shall be performed on the same data (default=false).", new BoolValue(false))); Problem = new RegressionProblem(); } public override IDeepCloneable Clone(Cloner cloner) { return new DecisionTreeRegression(this, cloner); } #endregion protected override void Initialize(CancellationToken cancellationToken) { base.Initialize(cancellationToken); var random = new MersenneTwister(); if (SetSeedRandomly) Seed = RandomSeedGenerator.GetSeed(); random.Reset(Seed); stateScope = InitializeScope(random, Problem.ProblemData, Pruning, MinimalNodeSize, LeafModel, Splitter, GenerateRules, UseHoldout, HoldoutSize); stateScope.Variables.Add(new Variable("Algorithm", this)); Results.AddOrUpdateResult("StateScope", stateScope); } protected override void Run(CancellationToken cancellationToken) { var model = Build(stateScope, Results, cancellationToken); AnalyzeSolution(model.CreateRegressionSolution(Problem.ProblemData), Results, Problem.ProblemData); } #region Static Interface public static IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, IRandom random, ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null, bool useHoldout = false, double holdoutSize = 0.2, int minimumLeafSize = 1, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) { if (leafModel == null) leafModel = new LinearLeaf(); if (splitter == null) splitter = new Splitter(); if (cancellationToken == null) cancellationToken = CancellationToken.None; if (pruning == null) pruning = new ComplexityPruning(); var stateScope = InitializeScope(random, problemData, pruning, minimumLeafSize, leafModel, splitter, generateRules, useHoldout, holdoutSize); var model = Build(stateScope, results, cancellationToken.Value); return model.CreateRegressionSolution(problemData); } public static void UpdateModel(IDecisionTreeModel model, IRegressionProblemData problemData, IRandom random, ILeafModel leafModel, CancellationToken? cancellationToken = null) { if (cancellationToken == null) cancellationToken = CancellationToken.None; var regressionTreeParameters = new RegressionTreeParameters(leafModel, problemData, random); var scope = new Scope(); scope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParameters)); leafModel.Initialize(scope); model.Update(problemData.TrainingIndices.ToList(), scope, cancellationToken.Value); } #endregion #region Helpers private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize) { var stateScope = new Scope("RegressionTreeStateScope"); //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise var doubleVars = new HashSet(problemData.Dataset.DoubleVariables); var vars = problemData.AllowedInputVariables.Concat(new[] {problemData.TargetVariable}).ToArray(); if (vars.Any(v => !doubleVars.Contains(v))) throw new NotSupportedException("Decision tree regression supports only double valued input or output features."); var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray(); if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset."); var trainingData = new Dataset(vars, doubles); var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable); pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows; pd.TrainingPartition.Start = 0; //store regression tree parameters var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter); stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams)); //initialize tree operators pruning.Initialize(stateScope); splitter.Initialize(stateScope); leafModel.Initialize(stateScope); //store unbuilt model IItem model; if (generateRules) { model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams); RegressionRuleSetModel.Initialize(stateScope); } else { model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams); } stateScope.Variables.Add(new Variable(ModelVariableName, model)); //store training & pruning indices IReadOnlyList trainingSet, pruningSet; GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet); stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray()))); stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray()))); return stateScope; } private static IRegressionModel Build(IScope stateScope, ResultCollection results, CancellationToken cancellationToken) { var regressionTreeParams = (RegressionTreeParameters)stateScope.Variables[RegressionTreeParameterVariableName].Value; var model = (IDecisionTreeModel)stateScope.Variables[ModelVariableName].Value; var trainingRows = (IntArray)stateScope.Variables[TrainingSetVariableName].Value; var pruningRows = (IntArray)stateScope.Variables[PruningSetVariableName].Value; if (1 > trainingRows.Length) return new PreconstructedLinearModel(new Dictionary(), 0, regressionTreeParams.TargetVariable); if (regressionTreeParams.MinLeafSize > trainingRows.Length) { var targets = regressionTreeParams.Data.GetDoubleValues(regressionTreeParams.TargetVariable).ToArray(); return new PreconstructedLinearModel(new Dictionary(), targets.Average(), regressionTreeParams.TargetVariable); } model.Build(trainingRows.ToArray(), pruningRows.ToArray(), stateScope, results, cancellationToken); return model; } private static void GeneratePruningSet(IReadOnlyList allrows, IRandom random, bool useHoldout, double holdoutSize, out IReadOnlyList training, out IReadOnlyList pruning) { if (!useHoldout) { training = allrows; pruning = allrows; return; } var perm = new Permutation(PermutationTypes.Absolute, allrows.Count, random); var cut = (int)(holdoutSize * allrows.Count); pruning = perm.Take(cut).Select(i => allrows[i]).ToArray(); training = perm.Take(cut).Select(i => allrows[i]).ToArray(); } private void AnalyzeSolution(IRegressionSolution solution, ResultCollection results, IRegressionProblemData problemData) { results.Add(new Result("RegressionSolution", (IItem)solution.Clone())); Dictionary frequencies = null; var tree = solution.Model as RegressionNodeTreeModel; if (tree != null) { results.Add(RegressionTreeAnalyzer.CreateLeafDepthHistogram(tree)); frequencies = RegressionTreeAnalyzer.GetTreeVariableFrequences(tree); RegressionTreeAnalyzer.AnalyzeNodes(tree, results, problemData); } var ruleSet = solution.Model as RegressionRuleSetModel; if (ruleSet != null) { results.Add(RegressionTreeAnalyzer.CreateRulesResult(ruleSet, problemData, "Rules", true)); frequencies = RegressionTreeAnalyzer.GetRuleVariableFrequences(ruleSet); results.Add(RegressionTreeAnalyzer.CreateCoverageDiagram(ruleSet, problemData)); } //Variable frequencies if (frequencies != null) { var sum = frequencies.Values.Sum(); sum = sum == 0 ? 1 : sum; var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) { ElementNames = frequencies.Select(i => i.Key) }; results.Add(new Result("Variable Frequences", "relative frequencies of variables in rules and tree nodes", impactArray)); } var pruning = Pruning as ComplexityPruning; if (pruning != null && tree != null) RegressionTreeAnalyzer.PruningChart(tree, pruning, results); } #endregion } }