Note that byte arrays can be stored in many places, including inside files or similar containers.
The extension of a file can be discovered by inspecting the file header. The file header is made up of the first bytes of the byte array, usually the first tens or hundreds of bytes. Some extensions have multiple possible file headers. This technique is a best-effort way to identify the byte contents of a column in a database.
Let's use PowerShell to inspect a file on disk, a sample JPEG file (.jpg). Let's run the following little script:
format-hex .\Stavkyrkje_Røldal.jpg | Select-Object -First 16
The first few bytes are FF D8 FF
I have added a sample GitHub repo with utility code to check well-known file types for their file extensions.
https://github.com/toreaurstadboss/FileHeaderUtil
The following screenshot shows the application in use. It found out that a byte array seems to be a PDF file by looking at the file header and file trailer. A good match was found:
In fact, it is a very good match, since both the header and the trailer fully agree. Note that the 0A bytes are just padding bytes at the end of files and are ignored in this util. See the method NormalizeHex presented further below.
FileSignatureUtil.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace FileHeaderUtil;
/// <summary>
/// Matches the leading and trailing bytes of a file against the file signatures loaded from
/// <c>file_Sigs.json</c> and ranks the candidates by the number of matching bytes.
/// </summary>
public static class FileSignatureUtil
{
    /// <summary>Text used in the signature data to mark a header or trailer as not defined.</summary>
    private const string NullMarker = "(NULL)";

    /// <summary>Signatures loaded once by the static constructor; never null.</summary>
    private static readonly FileSignature[] _fileSignatures;

    /// <summary>
    /// Loads the signature definitions from <c>file_Sigs.json</c>. The path is relative, so it is
    /// resolved against the current working directory of the process.
    /// </summary>
    static FileSignatureUtil()
    {
        string json = File.ReadAllText("file_Sigs.json");
        var fileSignaturesRoot = System.Text.Json.JsonSerializer.Deserialize<FileSignatureRootElement>(json, new System.Text.Json.JsonSerializerOptions
        {
            PropertyNameCaseInsensitive = true
        });

        // A JSON document without a signature list must not leave the cache null, otherwise every
        // later lookup would fail with a NullReferenceException instead of returning no matches.
        _fileSignatures = fileSignaturesRoot?.FileSigs?.ToArray() ?? [];
    }

    /// <summary>
    /// Scans the specified file and returns a list of file signatures that match the file's header and, if applicable,
    /// file's trailer.
    /// </summary>
    /// <remarks>Only file signatures with a defined header are considered for matching. Trailer matching is
    /// performed only when the signature defines a trailer. Trailing <c>0A</c>, <c>0D</c> and <c>00</c> bytes are
    /// ignored when comparing trailers. How many bytes are actually read from the file is decided by
    /// <c>FileUtil.ShowHeader</c> and <c>FileUtil.ShowTrailer</c>.</remarks>
    /// <param name="targetFile">The path to the file to be analyzed. Cannot be null or empty.</param>
    /// <param name="byteCount">Intended number of bytes to read for signature matching. Defaults to 64.
    /// Currently not forwarded to the file reader; kept for interface compatibility.</param>
    /// <param name="offset">The byte offset at which to begin reading the file header for signature matching. Defaults to 0.</param>
    /// <param name="origin">Intended reference point for <paramref name="offset"/>. Defaults to <see cref="SeekOrigin.Begin"/>.
    /// Currently not forwarded to the file reader; kept for interface compatibility.</param>
    /// <returns>A list of <see cref="FileSignature"/> objects that match the file's header and trailer, ordered by
    /// descending number of matching bytes. The list is empty if no signatures match.</returns>
    public static List<FileSignature> GetMatchingFileSignatures(string targetFile, int byteCount = 64, int offset = 0, SeekOrigin origin = SeekOrigin.Begin)
    {
        // Brings a hex dump into a canonical "AA BB CC" form: dashes become spaces, tokens are
        // upper-cased and, optionally, trailing padding bytes (0A / 0D / 00) are dropped.
        static string NormalizeHex(string? hex, bool trimPadding)
        {
            if (string.IsNullOrWhiteSpace(hex))
            {
                return string.Empty;
            }

            var parts = hex.Replace("-", " ")
                .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                .Select(h => h.ToUpperInvariant())
                .ToList();

            if (trimPadding)
            {
                while (parts.Count > 0 && parts[^1] is "0A" or "0D" or "00")
                {
                    parts.RemoveAt(parts.Count - 1);
                }
            }

            return string.Join(" ", parts);
        }

        static bool IsNullMarker(string value) =>
            string.Equals(value, NullMarker, StringComparison.OrdinalIgnoreCase);

        var matches = new List<(FileSignature Sig, int Score)>();

        // NOTE(review): assumes FileUtil.ShowHeader's offset parameter accepts an int - confirm.
        string fileHeader = NormalizeHex(FileUtil.ShowHeader(targetFile, offset: offset), trimPadding: false);
        string fileTrailer = NormalizeHex(FileUtil.ShowTrailer(targetFile), trimPadding: true);

        foreach (var signature in _fileSignatures)
        {
            // Signatures without a usable header cannot be matched at all.
            if (signature is null || string.IsNullOrWhiteSpace(signature.HeaderHex) || IsNullMarker(signature.HeaderHex))
            {
                continue;
            }

            string sigHeader = NormalizeHex(signature.HeaderHex, trimPadding: false);
            string sigTrailer = NormalizeHex(signature.TrailerHex, trimPadding: true);

            if (!fileHeader.StartsWith(sigHeader, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // The trailer only takes part in matching and scoring when the signature defines one.
            bool hasTrailer = sigTrailer.Length > 0 && !IsNullMarker(sigTrailer);
            if (hasTrailer && !fileTrailer.EndsWith(sigTrailer, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Compute match score (# of matching bytes in header and trailer of file)
            int headerScore = CountMatchingPrefix(fileHeader, sigHeader);
            int trailerScore = hasTrailer ? CountMatchingSuffix(fileTrailer, sigTrailer) : 0;
            int scoreMeasuredAsMatchingByteCount = headerScore + trailerScore;

            // NOTE(review): the counts are written onto the cached, shared signature instances, so
            // a later call (or a concurrent one) overwrites the values seen by earlier callers.
            signature.MatchingBytesCount = scoreMeasuredAsMatchingByteCount;
            signature.MatchingTrailerBytesCount = trailerScore;
            signature.MatchingHeaderBytesCount = headerScore;

            matches.Add((signature, scoreMeasuredAsMatchingByteCount));
        }

        return matches.OrderByDescending(m => m.Score).Select(m => m.Sig).ToList();
    }

    // Helpers

    /// <summary>
    /// Counts how many space-separated tokens at the start of <paramref name="source"/> equal the
    /// corresponding tokens of <paramref name="pattern"/> (case-insensitive), stopping at the first difference.
    /// </summary>
    private static int CountMatchingPrefix(string source, string pattern)
    {
        var srcParts = source.Split(' ');
        var patParts = pattern.Split(' ');
        int limit = Math.Min(srcParts.Length, patParts.Length);

        int count = 0;
        while (count < limit && srcParts[count].Equals(patParts[count], StringComparison.OrdinalIgnoreCase))
        {
            count++;
        }

        return count;
    }

    /// <summary>
    /// Counts how many space-separated tokens at the end of <paramref name="source"/> equal the
    /// corresponding tokens at the end of <paramref name="pattern"/> (case-insensitive), comparing
    /// from the last token backwards and stopping at the first difference. Returns 0 for an empty pattern.
    /// </summary>
    private static int CountMatchingSuffix(string source, string pattern)
    {
        if (string.IsNullOrWhiteSpace(pattern))
        {
            return 0;
        }

        var srcParts = source.Split(' ');
        var patParts = pattern.Split(' ');
        int limit = Math.Min(srcParts.Length, patParts.Length);

        int count = 0;
        while (count < limit && srcParts[^(count + 1)].Equals(patParts[^(count + 1)], StringComparison.OrdinalIgnoreCase))
        {
            count++;
        }

        return count;
    }
}


No comments:
Post a Comment