# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

# ---- lib/protocol/http/header/accept.rb ----

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
	module HTTP
		module Header
			# The `accept` header represents a list of media ranges (content types) that the client can accept.
			#
			# The header is stored as an array of raw entries (strings); use {media_ranges} to obtain parsed values.
			class Accept < Array
				# Regular expression used to split values on commas, taking into account quoted strings, which may themselves contain commas and backslash-escaped characters.
				#
				# Each repetition consumes either one whole quoted string or one single character, so there is only one way to match any given input (no nested quantifiers to backtrack through).
				SEPARATOR = /
					(?:
						"(?:[^"\\]|\\.)*"   # A quoted string, including backslash-escaped characters.
						|
						[^,"]               # Any single character other than a comma or a quote.
					)+
					(?=,|\z)                # The entry must be followed by a comma or the end of the string.
				/x
				
				# Raised when an entry cannot be parsed as a media range.
				ParseError = Class.new(Error)
				
				# Matches `type/subtype` followed by the (unparsed) parameter string.
				MEDIA_RANGE = /\A(?<type>#{TOKEN})\/(?<subtype>#{TOKEN})(?<parameters>.*)\z/
				
				# Matches a single `;key=value` parameter where the value is either a token or a quoted string.
				PARAMETER = /\s*;\s*(?<key>#{TOKEN})=((?<value>#{TOKEN})|(?<quoted_value>#{QUOTED_STRING}))/
				
				# A single entry in the Accept: header, which includes a mime type and associated parameters. A media range can include wild cards, but a media type is a specific type and subtype.
				MediaRange = Struct.new(:type, :subtype, :parameters) do
					# @parameter type [String] the type, e.g. `text` or `*`.
					# @parameter subtype [String] the subtype, e.g. `plain` or `*`.
					# @parameter parameters [Hash(String, String)] the parameters, e.g. `{"q" => "0.5"}`.
					def initialize(type, subtype = "*", parameters = {})
						super(type, subtype, parameters)
					end
					
					# Compare by quality factor, in descending order, so that sorting puts the most preferred media range first.
					#
					# @parameter other [MediaRange] the media range to compare against.
					# @returns [Integer] the result of comparing the other quality factor with this one.
					def <=>(other)
						other.quality_factor <=> quality_factor
					end
					
					# @returns [String] the parameters serialized as `;key=value` pairs, or an empty string if there are none.
					def parameters_string
						return "" if parameters.nil? || parameters.empty?
						
						parameters.map do |key, value|
							";#{key}=#{QuotedString.quote(value.to_s)}"
						end.join
					end
					
					# Case equality: another media range is compared as a struct, anything else is compared against `type/subtype`.
					#
					# @parameter other [MediaRange | String] the value to compare against.
					def ===(other)
						if other.is_a?(self.class)
							super
						else
							range_string === other
						end
					end
					
					# @returns [String] the `type/subtype` without any parameters.
					def range_string
						"#{type}/#{subtype}"
					end
					
					# @returns [String] the `type/subtype` followed by the serialized parameters.
					def to_s
						"#{type}/#{subtype}#{parameters_string}"
					end
					
					alias to_str to_s
					
					# @returns [Float] the value of the `q` parameter, or 1.0 if there is none.
					def quality_factor
						# Tolerate missing parameters in the same way as #parameters_string does:
						return 1.0 unless parameters
						
						parameters.fetch("q", 1.0).to_f
					end
					
					# @returns [Array(String, String)] the type and subtype. Any arguments are ignored.
					def split(*args)
						[type, subtype]
					end
				end
				
				# Parse the `accept` header value into a list of raw entries.
				#
				# @parameter value [String | Nil] the value of the header.
				def initialize(value = nil)
					if value
						super(value.scan(SEPARATOR).map(&:strip))
					else
						super()
					end
				end
				
				# Adds one or more comma-separated values to the header.
				#
				# The input string is split into distinct entries and appended to the array.
				#
				# @parameter value [String] the value or values to add, separated by commas.
				def <<(value)
					concat(value.scan(SEPARATOR).map(&:strip))
				end
				
				# Serializes the stored values into a comma-separated string.
				#
				# @returns [String] the serialized representation of the header values.
				def to_s
					join(",")
				end
				
				# Parse the entries of the `accept` header.
				#
				# @returns [Array(MediaRange)] the list of media ranges and their associated parameters, in header order.
				# @raises [ParseError] if any entry is not a valid media range.
				def media_ranges
					map do |entry|
						parse_media_range(entry)
					end
				end
				
				private
				
				# Parse a single entry such as `text/html;q=0.5`.
				#
				# @parameter value [String] the entry to parse.
				# @returns [MediaRange] the parsed media range.
				# @raises [ParseError] if the entry does not start with `type/subtype`.
				def parse_media_range(value)
					match = value.match(MEDIA_RANGE)
					
					raise ParseError, "Invalid media type: #{value.inspect}" unless match
					
					parameters = {}
					
					# With named captures, each scan result is exactly [key, value, quoted_value]:
					match[:parameters].scan(PARAMETER) do |key, token, quoted_value|
						parameters[key] = quoted_value ? QuotedString.unquote(quoted_value) : token
					end
					
					MediaRange.new(match[:type], match[:subtype], parameters)
				end
			end
		end
	end
end

# ---- lib/protocol/http/header/accept_charset.rb ----

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
	module HTTP
		module Header
			# The `accept-charset` header represents a list of character sets that the client can accept.
			class AcceptCharset < Split
				# Raised when an entry cannot be parsed as a character set.
				ParseError = Class.new(Error)
				
				# https://tools.ietf.org/html/rfc7231#section-5.3.1
				QVALUE = /0(\.[0-9]{0,3})?|1(\.[0]{0,3})?/
				
				# https://tools.ietf.org/html/rfc7231#section-5.3.3
				CHARSET = /\A(?<name>#{TOKEN})(\s*;\s*q=(?<q>#{QVALUE}))?\z/
				
				# A character set name together with its optional quality value (kept as the matched string, or nil).
				Charset = Struct.new(:name, :q) do
					# @returns [Float] the quality value, or 1.0 if none was given.
					def quality_factor
						(q || 1.0).to_f
					end
					
					# Compare by quality factor, in descending order.
					def <=>(other)
						other.quality_factor <=> quality_factor
					end
				end
				
				# Parse the `accept-charset` header value into a list of character sets.
				#
				# @returns [Array(Charset)] the list of character sets and their associated quality factors.
				# @raises [ParseError] if any entry is not a valid character set.
				def charsets
					map do |value|
						match = value.match(CHARSET)
						
						raise ParseError, "Could not parse character set: #{value.inspect}" unless match
						
						Charset.new(match[:name], match[:q])
					end
				end
			end
		end
	end
end

# ---- lib/protocol/http/header/accept_encoding.rb ----

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
	module HTTP
		module Header
			# The `accept-encoding` header represents a list of encodings that the client can accept.
			class AcceptEncoding < Split
				# Raised when an entry cannot be parsed as an encoding.
				ParseError = Class.new(Error)
				
				# https://tools.ietf.org/html/rfc7231#section-5.3.1
				QVALUE = /0(\.[0-9]{0,3})?|1(\.[0]{0,3})?/
				
				# https://tools.ietf.org/html/rfc7231#section-5.3.4
				ENCODING = /\A(?<name>#{TOKEN})(\s*;\s*q=(?<q>#{QVALUE}))?\z/
				
				# An encoding name together with its optional quality value (kept as the matched string, or nil).
				Encoding = Struct.new(:name, :q) do
					# @returns [Float] the quality value, or 1.0 if none was given.
					def quality_factor
						(q || 1.0).to_f
					end
					
					# Compare by quality factor, in descending order.
					def <=>(other)
						other.quality_factor <=> quality_factor
					end
				end
				
				# Parse the `accept-encoding` header value into a list of encodings.
				#
				# @returns [Array(Encoding)] the list of encodings and their associated quality factors.
				# @raises [ParseError] if any entry is not a valid encoding.
				def encodings
					map do |value|
						match = value.match(ENCODING)
						
						raise ParseError, "Could not parse encoding: #{value.inspect}" unless match
						
						Encoding.new(match[:name], match[:q])
					end
				end
			end
		end
	end
end

# ---- lib/protocol/http/header/accept_language.rb ----

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
	module HTTP
		module Header
			# The `accept-language` header represents a list of languages that the client can accept.
			class AcceptLanguage < Split
				# Raised when an entry cannot be parsed as a language.
				ParseError = Class.new(Error)
				
				# https://tools.ietf.org/html/rfc3066#section-2.1
				NAME = /\*|[A-Z]{1,8}(-[A-Z0-9]{1,8})*/i
				
				# https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.9
				#
				# This pattern accepts up to 6 digits after the decimal point.
				QVALUE = /0(\.[0-9]{0,6})?|1(\.[0]{0,6})?/
				
				# https://greenbytes.de/tech/webdav/rfc7231.html#quality.values
				#
				# Optional whitespace is permitted around the `;` separator.
				LANGUAGE = /\A(?<name>#{NAME})(\s*;\s*q=(?<q>#{QVALUE}))?\z/
				
				# A language name together with its optional quality value (kept as the matched string, or nil).
				Language = Struct.new(:name, :q) do
					# @returns [Float] the quality value, or 1.0 if none was given.
					def quality_factor
						(q || 1.0).to_f
					end
					
					# Compare by quality factor, in descending order, so that sorting puts the most preferred language first.
					def <=>(other)
						other.quality_factor <=> quality_factor
					end
				end
				
				# Parse the `accept-language` header value into a list of languages.
				#
				# @returns [Array(Language)] the list of languages and their associated quality factors.
				# @raises [ParseError] if any entry is not a valid language.
				def languages
					map do |value|
						match = value.match(LANGUAGE)
						
						raise ParseError, "Could not parse language: #{value.inspect}" unless match
						
						Language.new(match[:name], match[:q])
					end
				end
			end
		end
	end
end
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

# ---- lib/protocol/http/header/quoted_string.rb ----

module Protocol
	module HTTP
		module Header
			# According to https://tools.ietf.org/html/rfc7231#appendix-C
			TOKEN = /[!#$%&'*+\-.^_`|~0-9A-Z]+/i
			
			# A double-quoted string whose content is any mix of ordinary characters and backslash-escaped characters.
			#
			# NOTE(review): the pattern in the reviewed text was truncated; this is a reconstruction following https://tools.ietf.org/html/rfc7230#section-3.2.6 - confirm against the intended original.
			QUOTED_STRING = /"(?:[^"\\]|\\.)*"/
			
			# https://tools.ietf.org/html/rfc7231#section-5.3.1
			#
			# NOTE(review): re-added because the `accept-charset` parser refers to a `QVALUE` constant in this namespace - confirm it was part of the truncated text.
			QVALUE = /0(\.[0-9]{0,3})?|1(\.[0]{0,3})?/
			
			# Handling of HTTP quoted strings.
			module QuotedString
				# Unquote a quoted string. It should already match the QUOTED_STRING pattern above by the parser.
				#
				# @parameter value [String] the quoted string, including the surrounding double quotes.
				# @parameter normalize_whitespace [Boolean] whether to replace folded line breaks with a single space.
				# @returns [String] a new string without the surrounding quotes and with backslash escapes removed.
				def self.unquote(value, normalize_whitespace = true)
					# Drop the first and last character (the quotes). Slicing an empty string gives nil, so fall back to an empty string:
					unquoted = (value[1...-1] || "").gsub(/\\(.)/, '\1')
					
					if normalize_whitespace
						# LWS = [CRLF] 1*( SP | HT )
						unquoted = unquoted.gsub(/[\r\n]+\s+/, " ")
					end
					
					return unquoted
				end
				
				# Characters which can not appear in a bare token, so the value must be quoted.
				QUOTES_REQUIRED = /[()<>@,;:\\"\/\[\]?={} \t]/
				
				# Control characters (other than horizontal tab) which can not be represented in a quoted string; this includes CR and LF.
				INVALID_CHARACTERS = /[\x00-\x08\x0A-\x1F\x7F]/
				
				# Quote a string for HTTP header values if required.
				#
				# @parameter value [String] the value to quote.
				# @parameter force [Boolean] always quote the value, even if it is a valid token.
				# @returns [String] the value itself if no quoting was needed, otherwise a new quoted string with `"` and `\` escaped.
				# @raises [ArgumentError] if the value contains invalid characters like control characters or newlines.
				def self.quote(value, force = false)
					if value.match?(INVALID_CHARACTERS)
						raise ArgumentError, "Value contains invalid characters: #{value.inspect}"
					end
					
					# An empty value is not a valid token, so it has to be written as an empty quoted string:
					if force || value.empty? || value.match?(QUOTES_REQUIRED)
						"\"#{value.gsub(/["\\]/, '\\\\\0')}\""
					else
						value
					end
				end
			end
		end
	end
end
require_relative "header/split" require_relative "header/multiple" + require_relative "header/cookie" require_relative "header/connection" require_relative "header/cache_control" @@ -15,6 +16,11 @@ require_relative "header/date" require_relative "header/priority" +require_relative "header/accept" +require_relative "header/accept_charset" +require_relative "header/accept_encoding" +require_relative "header/accept_language" + module Protocol module HTTP # @namespace @@ -277,6 +283,12 @@ def []= key, value "last-modified" => Header::Date, "if-modified-since" => Header::Date, "if-unmodified-since" => Header::Date, + + # Accept headers: + "accept" => Header::Accept, + "accept-charset" => Header::AcceptCharset, + "accept-encoding" => Header::AcceptEncoding, + "accept-language" => Header::AcceptLanguage, }.tap{|hash| hash.default = Split} # Delete all header values for the given key, and return the merged value. diff --git a/releases.md b/releases.md index bc42b28..70b5364 100644 --- a/releases.md +++ b/releases.md @@ -1,5 +1,9 @@ # Releases +## Unreleased + + - Add support for parsing `accept`, `accept-charset`, `accept-encoding` and `accept-language` headers into structured values. + ## v0.46.0 - Add support for `priority:` header. diff --git a/test/protocol/http/header/accept.rb b/test/protocol/http/header/accept.rb new file mode 100644 index 0000000..db41bf2 --- /dev/null +++ b/test/protocol/http/header/accept.rb @@ -0,0 +1,98 @@ +# frozen_string_literal: true + +# Released under the MIT License. +# Copyright, 2025, by Samuel Williams. 

require "protocol/http/header/accept"

describe Protocol::HTTP::Header::Accept::MediaRange do
	it "should have default quality_factor of 1.0" do
		# Type and subtype are separate arguments; no parameters are given, so there is no explicit `q`:
		media_range = subject.new("text", "plain")
		expect(media_range.quality_factor).to be == 1.0
	end
	
	with "#===" do
		let(:media_range) {subject.new("text", "plain")}
		
		it "can compare with bare string" do
			expect(media_range).to be === "text/plain"
		end
		
		it "can compare with media range" do
			expect(media_range).to be === media_range
		end
	end
	
	with "#to_s" do
		it "can convert to string" do
			media_range = subject.new("text", "plain", {"q" => "0.5"})
			expect(media_range.to_s).to be == "text/plain;q=0.5"
		end
	end
	
	with "#split" do
		it "can split media range" do
			media_range = subject.new("text", "plain", {"q" => "0.5"})
			type, subtype = media_range.split
			expect(type).to be == "text"
			expect(subtype).to be == "plain"
		end
	end
end

describe Protocol::HTTP::Header::Accept do
	# The header value under test is the description of the enclosing `with` block.
	let(:header) {subject.new(description)}
	let(:media_ranges) {header.media_ranges.sort}
	
	with "text/plain, text/html;q=0.5, text/xml;q=0.25" do
		it "can parse media ranges" do
			expect(header.length).to be == 3
			
			expect(media_ranges[0].range_string).to be == "text/plain"
			expect(media_ranges[0].quality_factor).to be == 1.0
			
			expect(media_ranges[1].range_string).to be == "text/html"
			expect(media_ranges[1].quality_factor).to be == 0.5
			
			expect(media_ranges[2].range_string).to be == "text/xml"
			expect(media_ranges[2].quality_factor).to be == 0.25
		end
		
		it "can convert to string" do
			expect(header.to_s).to be == "text/plain,text/html;q=0.5,text/xml;q=0.25"
		end
	end
	
	with "foobar" do
		it "fails to parse" do
			expect{media_ranges}.to raise_exception(Protocol::HTTP::Header::Accept::ParseError)
		end
	end
	
	with "text/html;q=0.25, text/xml;q=0.5, text/plain" do
		it "should order based on quality factor" do
			expect(media_ranges.collect(&:range_string)).to be == %w{text/plain text/xml text/html}
		end
	end
	
	with "text/html, text/plain;q=0.8, text/xml;q=0.6, application/json" do
		it "should order based on quality factor" do
			expect(media_ranges.collect(&:range_string)).to be == %w{text/html application/json text/plain text/xml}
		end
	end
	
	with "*/*;q=0" do
		it "should accept wildcard media range" do
			expect(media_ranges[0].range_string).to be == "*/*"
			expect(media_ranges[0].quality_factor).to be == 0
		end
	end
	
	with "text/html;schema=\"example.org\";q=0.5" do
		it "should parse parameters" do
			expect(media_ranges[0].parameters).to have_keys(
				"schema" => be == "example.org",
				"q" => be == "0.5",
			)
		end
	end
end

# ---- test/protocol/http/header/accept_charset.rb ----

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require "protocol/http/header/accept_charset"

describe Protocol::HTTP::Header::AcceptCharset::Charset do
	it "should have default quality_factor of 1.0" do
		charset = subject.new("utf-8", nil)
		expect(charset.quality_factor).to be == 1.0
	end
end

describe Protocol::HTTP::Header::AcceptCharset do
	# The header value under test is the description of the enclosing `with` block.
	let(:header) {subject.new(description)}
	let(:charsets) {header.charsets.sort}
	
	with "utf-8, iso-8859-1;q=0.5, windows-1252;q=0.25" do
		it "can parse charsets" do
			expect(header.length).to be == 3
			
			expect(charsets[0].name).to be == "utf-8"
			expect(charsets[0].quality_factor).to be == 1.0
			
			expect(charsets[1].name).to be == "iso-8859-1"
			expect(charsets[1].quality_factor).to be == 0.5
			
			expect(charsets[2].name).to be == "windows-1252"
			expect(charsets[2].quality_factor).to be == 0.25
		end
	end
	
	with "windows-1252;q=0.25, iso-8859-1;q=0.5, utf-8" do
		it "should order based on quality factor" do
			expect(charsets.collect(&:name)).to be == %w{utf-8 iso-8859-1 windows-1252}
		end
	end
	
	with "us-ascii,iso-8859-1;q=0.8,windows-1252;q=0.6,utf-8" do
		it "should order based on quality factor" do
			expect(charsets.collect(&:name)).to be == %w{us-ascii utf-8 iso-8859-1 windows-1252}
		end
	end
	
	with "*;q=0" do
		it "should accept wildcard charset" do
			expect(charsets[0].name).to be == "*"
			expect(charsets[0].quality_factor).to be == 0
		end
	end
	
	with "utf-8, iso-8859-1;q=0.5, windows-1252;q=0.5" do
		it "should preserve relative order" do
			expect(charsets[0].name).to be == "utf-8"
			expect(charsets[1].name).to be == "iso-8859-1"
			expect(charsets[2].name).to be == "windows-1252"
		end
	end
	
	it "should not accept invalid input" do
		bad_values = [
			# Invalid quality factor:
			"utf-8;f=1",
			
			# Invalid parameter:
			"us-ascii;utf-8",
			
			# Invalid use of separator:
			";",
			
			# Empty charset (we ignore this one):
			# ","
		]
		
		bad_values.each do |value|
			expect{subject.new(value).charsets}.to raise_exception(subject::ParseError)
		end
	end
end

# ---- test/protocol/http/header/accept_encoding.rb ----

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require "protocol/http/header/accept_encoding"

describe Protocol::HTTP::Header::AcceptEncoding::Encoding do
	it "should have default quality_factor of 1.0" do
		encoding = subject.new("gzip", nil)
		expect(encoding.quality_factor).to be == 1.0
	end
end

describe Protocol::HTTP::Header::AcceptEncoding do
	# The header value under test is the description of the enclosing `with` block.
	let(:header) {subject.new(description)}
	let(:encodings) {header.encodings.sort}
	
	with "gzip, deflate;q=0.5, identity;q=0.25" do
		it "can parse encodings" do
			expect(header.length).to be == 3
			
			expect(encodings[0].name).to be == "gzip"
			expect(encodings[0].quality_factor).to be == 1.0
			
			expect(encodings[1].name).to be == "deflate"
			expect(encodings[1].quality_factor).to be == 0.5
			
			expect(encodings[2].name).to be == "identity"
			expect(encodings[2].quality_factor).to be == 0.25
		end
	end
	
	with "identity;q=0.25, deflate;q=0.5, gzip" do
		it "should order based on quality factor" do
			expect(encodings.collect(&:name)).to be == %w{gzip deflate identity}
		end
	end
	
	with "br,deflate;q=0.8,identity;q=0.6,gzip" do
		it "should order based on quality factor" do
			expect(encodings.collect(&:name)).to be == %w{br gzip deflate identity}
		end
	end
	
	with "*;q=0" do
		it "should accept wildcard encoding" do
			expect(encodings[0].name).to be == "*"
			expect(encodings[0].quality_factor).to be == 0
		end
	end
	
	with "br, gzip;q=0.5, deflate;q=0.5" do
		it "should preserve relative order" do
			expect(encodings[0].name).to be == "br"
			expect(encodings[1].name).to be == "gzip"
			expect(encodings[2].name).to be == "deflate"
		end
	end
	
	it "should not accept invalid input" do
		bad_values = [
			# Invalid quality factor:
			"br;f=1",
			
			# Invalid parameter:
			"br;gzip",
			
			# Invalid use of separator:
			";",
			
			# Empty (we ignore this one):
			# ","
		]
		
		bad_values.each do |value|
			expect{subject.new(value).encodings}.to raise_exception(subject::ParseError)
		end
	end
end
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

# ---- test/protocol/http/header/accept_language.rb ----

require "protocol/http/header/accept_language"

describe Protocol::HTTP::Header::AcceptLanguage::Language do
	it "should have default quality_factor of 1.0" do
		language = subject.new("en", nil)
		expect(language.quality_factor).to be == 1.0
	end
end

describe Protocol::HTTP::Header::AcceptLanguage do
	# The header value under test is the description of the enclosing `with` block.
	let(:header) {subject.new(description)}
	let(:languages) {header.languages.sort}
	
	with "da, en-gb;q=0.5, en;q=0.25" do
		it "can parse languages" do
			expect(header.length).to be == 3
			
			expect(languages[0].name).to be == "da"
			expect(languages[0].quality_factor).to be == 1.0
			
			expect(languages[1].name).to be == "en-gb"
			expect(languages[1].quality_factor).to be == 0.5
			
			expect(languages[2].name).to be == "en"
			expect(languages[2].quality_factor).to be == 0.25
		end
	end
	
	with "en-gb;q=0.25, en;q=0.5, en-us" do
		it "should order based on quality factor" do
			expect(languages.collect(&:name)).to be == %w{en-us en en-gb}
		end
	end
	
	with "en-us,en-gb;q=0.8,en;q=0.6,es-419" do
		it "should order based on quality factor" do
			expect(languages.collect(&:name)).to be == %w{en-us es-419 en-gb en}
		end
	end
	
	with "*;q=0" do
		it "should accept wildcard language" do
			expect(languages[0].name).to be == "*"
			expect(languages[0].quality_factor).to be == 0
		end
	end
	
	with "en, de;q=0.5, jp;q=0.5" do
		it "should preserve relative order" do
			expect(languages[0].name).to be == "en"
			expect(languages[1].name).to be == "de"
			expect(languages[2].name).to be == "jp"
		end
	end
	
	with "de, en-US; q=0.7, en ; q=0.3" do
		it "should parse with optional whitespace" do
			expect(languages[0].name).to be == "de"
			expect(languages[1].name).to be == "en-US"
			expect(languages[2].name).to be == "en"
		end
	end
	
	with "en;q=0.123456" do
		it "accepts quality factors with up to 6 decimal places" do
			expect(languages[0].name).to be == "en"
			expect(languages[0].quality_factor).to be == 0.123456
		end
	end
	
	it "should not accept invalid input" do
		bad_values = [
			# Invalid quality factor:
			"en;f=1",
			
			# Invalid parameter:
			"de;fr",
			
			# Invalid use of separator:
			";",
			
			# Empty (we ignore this one):
			# ","
		]
		
		bad_values.each do |value|
			expect{subject.new(value).languages}.to raise_exception(subject::ParseError)
		end
	end
end

# ---- test/protocol/http/header/quoted_string.rb ----

# Released under the MIT License.
# Copyright, 2016-2024, by Samuel Williams.

require "protocol/http/header/quoted_string"

describe Protocol::HTTP::Header::QuotedString do
	with ".unquote" do
		it "ignores linear whitespace" do
			quoted_string = subject.unquote(%Q{"Hello\r\n World"})
			
			expect(quoted_string).to be == "Hello World"
		end
		
		it "removes backslash escapes" do
			quoted_string = subject.unquote(%q{"Hello \"World\""})
			
			expect(quoted_string).to be == %q{Hello "World"}
		end
	end
	
	with ".quote" do
		it "doesn't quote a string that has no special characters" do
			quoted_string = subject.quote("Hello")
			
			expect(quoted_string).to be == "Hello"
		end
		
		it "quotes a string with a space" do
			quoted_string = subject.quote("Hello World")
			
			expect(quoted_string).to be == %Q{"Hello World"}
		end
		
		it "quotes a string with a double quote" do
			quoted_string = subject.quote(%Q{Hello "World"})
			
			expect(quoted_string).to be == %Q{"Hello \\"World\\""}
		end
		
		it "quotes a string with a backslash" do
			quoted_string = subject.quote(%Q{Hello \\World})
			
			expect(quoted_string).to be == %Q{"Hello \\\\World"}
		end
		
		it "quotes a string with no special characters when forced" do
			quoted_string = subject.quote("Hello", true)
			
			expect(quoted_string).to be == %Q{"Hello"}
		end
	end
end