Skip to content

Commit

Permalink
Granular features: linking syllables count feature to pipeline, and updating the acceptance tests for the syllables count feature
Browse files Browse the repository at this point in the history
  • Loading branch information
neomatrix369 committed May 14, 2021
1 parent b99b08e commit 498338e
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 34 deletions.
1 change: 1 addition & 0 deletions nlp_profiler/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,6 @@
CHARACTERS_COUNT_COL = 'characters_count'
ENGLISH_CHARACTERS_COUNT_COL = 'english_characters_count'
NON_ENGLISH_CHARACTERS_COUNT_COL = 'non_english_characters_count'
SYLLABLES_COUNT_COL = 'syllables_count'
SENTENCES_COUNT_COL = 'sentences_count'
NOUN_PHASE_COUNT_COL = 'noun_phase_count'
7 changes: 5 additions & 2 deletions nlp_profiler/granular_features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
REPEATED_SPACES_COUNT_COL, WHITESPACES_COUNT_COL, CHARS_EXCL_WHITESPACES_COUNT_COL, \
REPEATED_WHITESPACES_COUNT_COL, ALPHA_NUMERIC_COUNT_COL, REPEATED_LETTERS_COUNT_COL, \
WHOLE_NUMBERS_COUNT_COL, REPEATED_DIGITS_COUNT_COL, EMOJI_COUNT_COL, \
NOUN_PHASE_COUNT_COL, ENGLISH_CHARACTERS_COUNT_COL, NON_ENGLISH_CHARACTERS_COUNT_COL
NOUN_PHASE_COUNT_COL, ENGLISH_CHARACTERS_COUNT_COL, NON_ENGLISH_CHARACTERS_COUNT_COL, \
SYLLABLES_COUNT_COL
from nlp_profiler.generate_features import generate_features
from nlp_profiler.granular_features.alphanumeric import count_alpha_numeric
from nlp_profiler.granular_features.chars_spaces_and_whitespaces \
Expand All @@ -28,6 +29,7 @@
from nlp_profiler.granular_features.letters import count_repeated_letters
from nlp_profiler.granular_features.non_alphanumeric import count_non_alpha_numeric
from nlp_profiler.granular_features.noun_phase_count import count_noun_phase
from nlp_profiler.granular_features.syllables import count_syllables
from nlp_profiler.granular_features.numbers import count_whole_numbers, count_repeated_digits
from nlp_profiler.granular_features.punctuations import count_punctuations, count_repeated_punctuations
from nlp_profiler.granular_features.sentences import count_sentences
Expand Down Expand Up @@ -62,7 +64,8 @@ def apply_granular_features(heading: str,
(DATES_COUNT_COL, text_column, count_dates),
(NOUN_PHASE_COUNT_COL, text_column, count_noun_phase),
(ENGLISH_CHARACTERS_COUNT_COL, text_column, count_english_chars),
(NON_ENGLISH_CHARACTERS_COUNT_COL, text_column, count_non_english_chars)
(NON_ENGLISH_CHARACTERS_COUNT_COL, text_column, count_non_english_chars),
(SYLLABLES_COUNT_COL, text_column, count_syllables)
]
generate_features(
heading, steps_for_features,
Expand Down
32 changes: 16 additions & 16 deletions tests/acceptance_tests/data/expected_profiled_dataframe.csv
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised,ease_of_reading_score,ease_of_reading_quality,ease_of_reading_summarised
I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,0.38,Pretty positive,Positive,0.43,Objective/subjective,Objective/subjective,1.0,Very good,Good,116.15,Very Easy,Easy
2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,-0.10681818181818181,Pretty negative,Negative,0.55,Objective/subjective,Objective/subjective,1.0,Very good,Good,107.69,Very Easy,Easy
2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,105.66,Very Easy,Easy
Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,-0.2916666666666667,Pretty negative,Negative,0.5416666666666666,Objective/subjective,Objective/subjective,0.6923076923076923,Bad,Bad,91.27,Very Easy,Easy
283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.8571428571428572,Bad,Bad,106.67,Very Easy,Easy
"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,0.375,Pretty positive,Positive,0.75,Pretty subjective,Subjective,0.9444444444444444,Pretty good,Good,66.74,Standard,Standard
283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.95,Good,Good,89.75,Easy,Easy
"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy
The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy
Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,0.03888888888888886,Pretty positive,Positive,0.5611111111111111,Objective/subjective,Objective/subjective,1.0,Very good,Good,100.24,Very Easy,Easy
283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,56.93,Fairly Difficult,Difficult
text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,syllables_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised,ease_of_reading_score,ease_of_reading_quality,ease_of_reading_summarised
I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,6,0.38,Pretty positive,Positive,0.43,Objective/subjective,Objective/subjective,1.0,Very good,Good,116.15,Very Easy,Easy
2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,13,-0.10681818181818181,Pretty negative,Negative,0.55,Objective/subjective,Objective/subjective,1.0,Very good,Good,107.69,Very Easy,Easy
2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,9,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,105.66,Very Easy,Easy
Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,14,-0.2916666666666667,Pretty negative,Negative,0.5416666666666666,Objective/subjective,Objective/subjective,0.6923076923076923,Bad,Bad,91.27,Very Easy,Easy
283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,8,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.8571428571428572,Bad,Bad,106.67,Very Easy,Easy
"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,19,0.375,Pretty positive,Positive,0.75,Pretty subjective,Subjective,0.9444444444444444,Pretty good,Good,66.74,Standard,Standard
283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,9,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.95,Good,Good,89.75,Easy,Easy
"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,13,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy
The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,13,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy
Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,18,0.03888888888888886,Pretty positive,Positive,0.5611111111111111,Objective/subjective,Objective/subjective,1.0,Very good,Good,100.24,Very Easy,Easy
283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,10,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,56.93,Fairly Difficult,Difficult
"2833047 pe
ople li ve in this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8888888888888888,Bad,Bad,130.02,Very Easy,Easy
ople li ve in this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,7,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8888888888888888,Bad,Bad,130.02,Very Easy,Easy
"2833047 people

live in th
is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,63.36,Standard,Standard
is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,13,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,63.36,Standard,Standard
"±§£ABCDEabcdef0123456789
is are
!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8928571428571429,Bad,Bad,93.81,Very Easy,Easy
!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,4,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8928571428571429,Bad,Bad,93.81,Very Easy,Easy
"2833047 pe
ople li ve in this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.7435897435897436,Bad,Bad,69.45,Standard,Standard
ople li ve in this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,34,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.7435897435897436,Bad,Bad,69.45,Standard,Standard
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count
I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2
2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0
2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0
Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0
283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0
"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0
283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0
"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0
The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0
Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0
283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0
text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,syllables_count
I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,6
2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,13
2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,9
Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,14
283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,8
"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,19
283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,9
"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,13
The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,13
Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,18
283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,10
"2833047 pe
ople li ve in this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0
ople li ve in this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,7
"2833047 people

live in th
is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0
is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,13
"±§£ABCDEabcdef0123456789
is are
!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1
!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,4
"2833047 pe
ople li ve in this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155
ople li ve in this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,34
Expand Down

0 comments on commit 498338e

Please sign in to comment.