Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve 'collection modified' exception when logging results #2549

Merged
merged 8 commits into from
Oct 9, 2022
2 changes: 2 additions & 0 deletions src/ReleaseHistory.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# SARIF Package Release History (SDK, Driver, Converters, and Multitool)

## Unreleased
* BUGFIX: Another attempt to resolve 'InvalidOperationException' with message `Collection was modified; enumeration operation may not execute` in `MultithreadedAnalyzeCommandBase`, raised when analyzing with the `--hashes` switch. [#2459](https://github.com/microsoft/sarif-sdk/pull/2549). There was a previous attempt to fix this in [#2447](https://github.com/microsoft/sarif-sdk/pull/2447).

* FEATURE: Allow initialization of file regions cache in `InsertOptionalDataVisitor` (previously initialized exclusively from `FileRegionsCache.Instance`).
* BUGFIX: Resolve issue where `match-results-forward` command fails to generate VersionControlDetails data. [#2487](https://github.com/microsoft/sarif-sdk/pull/2487)
* BUGFIX: Remove duplicated rule definitions when executing `match-results-forward` commands for results with sub-rule ids. [#2486](https://github.com/microsoft/sarif-sdk/pull/2486)
Expand Down
124 changes: 65 additions & 59 deletions src/Sarif.Driver/Sdk/MultithreadedAnalyzeCommandBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,16 @@ public abstract class MultithreadedAnalyzeCommandBase<TContext, TOptions> : Plug
internal bool _captureConsoleOutput;

internal ConsoleLogger _consoleLogger;
internal ConcurrentDictionary<string, IAnalysisLogger> _analysisLoggerCache;

private Run _run;
private Tool _tool;
private bool _computeHashes;
internal TContext _rootContext;
private int _fileContextsCount;
private Channel<int> _hashChannel;
private Channel<int> readyToHashChannel;
private OptionallyEmittedData _dataToInsert;
private Channel<int> _resultsWritingChannel;
private Channel<int> _fileEnumerationChannel;
private Channel<int> readyToScanChannel;
private IDictionary<string, HashData> _pathToHashDataMap;
private ConcurrentDictionary<int, TContext> _fileContexts;

Expand Down Expand Up @@ -190,12 +189,18 @@ private void MultithreadedAnalyzeTargets(TOptions options,
: (Debugger.IsAttached) ? 1 : Environment.ProcessorCount;

var channelOptions = new BoundedChannelOptions(2000)
{
SingleWriter = true,
SingleReader = true,
};
readyToHashChannel = Channel.CreateBounded<int>(channelOptions);
Copy link
Member Author

@michaelcfanning michaelcfanning Oct 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

readyToHashChannel

Currently the hashing channel is single-threaded. There is an opportunity here, I think, to improve performance by making this phase multi-threaded.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have a line break here and six lines down to improve readability?


channelOptions = new BoundedChannelOptions(2000)
{
SingleWriter = true,
SingleReader = false,
};
_fileEnumerationChannel = Channel.CreateBounded<int>(channelOptions);
_hashChannel = Channel.CreateBounded<int>(channelOptions);
readyToScanChannel = Channel.CreateBounded<int>(channelOptions);

channelOptions = new BoundedChannelOptions(2000)
{
Expand All @@ -206,26 +211,39 @@ private void MultithreadedAnalyzeTargets(TOptions options,

var sw = Stopwatch.StartNew();

var workers = new Task<bool>[options.Threads];
// 1: First we initiate an asynchronous operation to locate disk files for
// analysis, as specified in analysis configuration (file names, wildcards).
Task<bool> enumerateFilesOnDisk = EnumerateFilesOnDiskAsync(options, rootContext);

// 2: Files found on disk are put in a specific sort order, after which a
// reference to each scan target is put into a channel for hashing,
// if hashing is enabled.
Task<bool> hashFilesAndPutInAnalysisQueue = HashFilesAndPutInAnalysisQueueAsnc();

// 3: A dedicated set of threads pull scan targets and analyze them.
// On completing a scan, the thread writes the index of the
// scanned item to a channel that drives logging.
var workers = new Task<bool>[options.Threads];
for (int i = 0; i < options.Threads; i++)
{
workers[i] = AnalyzeTargetAsync(skimmers, disabledSkimmers);
workers[i] = ScanTargetsAsync(skimmers, disabledSkimmers);
}

Task<bool> hashFiles = HashAsync();
Task<bool> findFiles = FindFilesAsync(options, rootContext);
Task<bool> writeResults = WriteResultsAsync(rootContext);

// FindFiles is single-thread and will close its write channel
findFiles.Wait();
hashFiles.Wait();
// 4: A single-threaded consumer watches for completed scans
// and logs results, if any. This operation is single-threaded
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

singlle

typo -> single

// in order to help ensure determinism in log output. i.e., any
// scan of the same targets using the same production code
// should produce a log file that is byte-for-byte identical
// to the previous output.
Task<bool> logScanResults = LogScanResultsAsync(rootContext);

Task.WhenAll(workers)
.ContinueWith(_ => _resultsWritingChannel.Writer.Complete())
.Wait();

writeResults.Wait();
enumerateFilesOnDisk.Wait();
hashFilesAndPutInAnalysisQueue.Wait();
logScanResults.Wait();

Console.WriteLine();

Expand All @@ -239,7 +257,7 @@ private void MultithreadedAnalyzeTargets(TOptions options,
}
}

private async Task<bool> WriteResultsAsync(TContext rootContext)
private async Task<bool> LogScanResultsAsync(TContext rootContext)
{
int currentIndex = 0;

Expand All @@ -265,15 +283,7 @@ private async Task<bool> WriteResultsAsync(TContext rootContext)

while (context?.AnalysisComplete == true)
{
if (_computeHashes)
{
bool cache = _analysisLoggerCache.TryGetValue(context.Hashes.Sha256, out IAnalysisLogger logger);
LogCachingLogger(rootContext, logger ?? context.Logger, context, clone: cache);
}
else
{
LogCachingLogger(rootContext, context.Logger, context);
}
LogCachingLogger(rootContext, context, clone: _computeHashes);

RuntimeErrors |= context.RuntimeErrors;

Expand All @@ -300,9 +310,9 @@ private async Task<bool> WriteResultsAsync(TContext rootContext)
return true;
}

private void LogCachingLogger(TContext rootContext, IAnalysisLogger logger, TContext context, bool clone = false)
private void LogCachingLogger(TContext rootContext, TContext context, bool clone = false)
{
var cachingLogger = (CachingLogger)logger;
var cachingLogger = (CachingLogger)context.Logger;
IDictionary<ReportingDescriptor, IList<Result>> results = cachingLogger.Results;

if (results?.Count > 0)
Expand Down Expand Up @@ -352,7 +362,7 @@ private void LogCachingLogger(TContext rootContext, IAnalysisLogger logger, TCon
}
}

private async Task<bool> FindFilesAsync(TOptions options, TContext rootContext)
private async Task<bool> EnumerateFilesOnDiskAsync(TOptions options, TContext rootContext)
{
this._fileContextsCount = 0;
this._fileContexts = new ConcurrentDictionary<int, TContext>();
Expand Down Expand Up @@ -432,12 +442,12 @@ private async Task<bool> FindFilesAsync(TOptions options, TContext rootContext)
filePath: file)
);

await _hashChannel.Writer.WriteAsync(_fileContextsCount++);
await readyToHashChannel.Writer.WriteAsync(_fileContextsCount++);
}
}
}

_hashChannel.Writer.Complete();
readyToHashChannel.Writer.Complete();

if (_fileContextsCount == 0)
{
Expand All @@ -464,9 +474,11 @@ private void EnqueueAllDirectories(Queue<string> queue, string directory)
}
}

private async Task<bool> HashAsync()
private async Task<bool> HashFilesAndPutInAnalysisQueueAsnc()
{
ChannelReader<int> reader = _hashChannel.Reader;
ChannelReader<int> reader = readyToHashChannel.Reader;

Dictionary<string, CachingLogger> loggerCache = null;

// Wait until there is work or the channel is closed.
while (await reader.WaitToReadAsync())
Expand All @@ -487,20 +499,28 @@ private async Task<bool> HashAsync()
{
_pathToHashDataMap.Add(localPath, hashData);
}

loggerCache ??= new Dictionary<string, CachingLogger>();

if (!loggerCache.TryGetValue(hashData.Sha256, out CachingLogger logger))
{
logger = loggerCache[hashData.Sha256] = (CachingLogger)context.Logger;
}
context.Logger = logger;
}

await _fileEnumerationChannel.Writer.WriteAsync(index);
await readyToScanChannel.Writer.WriteAsync(index);
}
}

_fileEnumerationChannel.Writer.Complete();
readyToScanChannel.Writer.Complete();

return true;
}

private async Task<bool> AnalyzeTargetAsync(IEnumerable<Skimmer<TContext>> skimmers, ISet<string> disabledSkimmers)
private async Task<bool> ScanTargetsAsync(IEnumerable<Skimmer<TContext>> skimmers, ISet<string> disabledSkimmers)
{
ChannelReader<int> reader = _fileEnumerationChannel.Reader;
ChannelReader<int> reader = readyToScanChannel.Reader;

// Wait until there is work or the channel is closed.
while (await reader.WaitToReadAsync())
Expand Down Expand Up @@ -532,9 +552,6 @@ private async Task<bool> AnalyzeTargetAsync(IEnumerable<Skimmer<TContext>> skimm

protected virtual void ValidateOptions(TOptions options, TContext context)
{
_dataToInsert = options.DataToInsert.ToFlags();
_computeHashes = (_dataToInsert & OptionallyEmittedData.Hashes) != 0;

bool succeeded = true;

succeeded &= ValidateFile(context, options.OutputFilePath, DefaultPolicyName, shouldExist: null);
Expand Down Expand Up @@ -564,10 +581,8 @@ internal AggregatingLogger InitializeLogger(AnalyzeOptionsBase analyzeOptions)
logger.Loggers.Add(_consoleLogger);
}

if ((analyzeOptions.DataToInsert.ToFlags() & OptionallyEmittedData.Hashes) != 0)
{
_analysisLoggerCache = new ConcurrentDictionary<string, IAnalysisLogger>();
}
_dataToInsert = analyzeOptions.DataToInsert.ToFlags();
_computeHashes = (_dataToInsert & OptionallyEmittedData.Hashes) != 0;

return logger;
}
Expand Down Expand Up @@ -868,29 +883,20 @@ protected virtual TContext DetermineApplicabilityAndAnalyze(
return context;
}

IAnalysisLogger logger = context.Logger;

if (_computeHashes)
{
if (_analysisLoggerCache.ContainsKey(context.Hashes.Sha256))
{
return context;
}
}

context.Logger.AnalyzingTarget(context);
CachingLogger logger = (CachingLogger)context.Logger;
logger.AnalyzingTarget(context);

if (_computeHashes)
if (logger.CacheFinalized)
{
if (!_analysisLoggerCache.TryAdd(context.Hashes.Sha256, logger))
{
return context;
}
context.Logger = logger;
logger.ReleaseLock();
return context;
}

IEnumerable<Skimmer<TContext>> applicableSkimmers = DetermineApplicabilityForTarget(context, skimmers, disabledSkimmers);
AnalyzeTarget(context, applicableSkimmers, disabledSkimmers);

logger.ReleaseLock();
return context;
}

Expand Down
38 changes: 33 additions & 5 deletions src/Sarif/Core/Artifact.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public static Artifact Create(
var artifact = new Artifact()
{
Encoding = encoding?.WebName,
Hashes = hashData != null ? CreateHashesDictionary(hashData) : null,
};

string mimeType = SarifWriters.MimeType.DetermineFromFileExtension(uri);
Expand Down Expand Up @@ -70,19 +71,46 @@ public static Artifact Create(
if (dataToInsert.HasFlag(OptionallyEmittedData.Hashes))
{
HashData hashes = hashData ?? HashUtilities.ComputeHashes(filePath);
artifact.Hashes = new Dictionary<string, string>

// The hash utilities will return null data in some text contexts.
if (hashes != null)
{
{ "md5", hashes.MD5 },
{ "sha-1", hashes.Sha1 },
{ "sha-256", hashes.Sha256 },
};
artifact.Hashes = new Dictionary<string, string>
{
{ "md5", hashes.MD5 },
{ "sha-1", hashes.Sha1 },
{ "sha-256", hashes.Sha256 },
};
}
}
}
catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) { }

return artifact;
}

/// <summary>
/// Builds a hashes dictionary keyed by SARIF algorithm name ("md5", "sha-1",
/// "sha-256"), including only the hash values that are actually present.
/// </summary>
/// <param name="hashData">Computed hashes for an artifact; may be null or partially populated.</param>
/// <returns>A dictionary containing the available hashes (possibly empty); never null.</returns>
private static IDictionary<string, string> CreateHashesDictionary(HashData hashData)
{
    var result = new Dictionary<string, string>();

    // A null argument yields an empty dictionary rather than throwing
    // (matching the original null-conditional behavior). Guarding once here
    // avoids repeating the '?.' operator on every property access below.
    if (hashData == null)
    {
        return result;
    }

    if (!string.IsNullOrEmpty(hashData.MD5))
    {
        result["md5"] = hashData.MD5;
    }

    if (!string.IsNullOrEmpty(hashData.Sha1))
    {
        result["sha-1"] = hashData.Sha1;
    }

    if (!string.IsNullOrEmpty(hashData.Sha256))
    {
        result["sha-256"] = hashData.Sha256;
    }

    return result;
}

private static ArtifactContent GetEncodedFileContents(IFileSystem fileSystem, string filePath, string mimeType, Encoding inputFileEncoding)
{
var fileContent = new ArtifactContent();
Expand Down
12 changes: 12 additions & 0 deletions src/Sarif/HashUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ public static HashData ComputeHashes(string fileName, IFileSystem fileSystem = n
{
using (Stream stream = fileSystem.FileOpenRead(fileName))
{
// This condition is actually only feasible in testing, as a null
// value will only be returned by a mock object that doesn't
// recognize the current specified file argument. In production,
// an exception will always be raised for a missing file. If
// we enter the code below, that indicates that a test
// encountered an adverse condition and is attempting to produce
// a file hash for some source file in a notification stack. We
// return null here, as the actual source hash isn't interesting
// for this scenario, and we want to reliably finish test execution
// and record exception details.
if (stream == null) { return null; }

using (var bufferedStream = new BufferedStream(stream, 1024 * 32))
{
string md5, sha1, sha256;
Expand Down
Loading